Source code for ccdc.search

#
# This code is Copyright (C) 2015 The Cambridge Crystallographic Data Centre
# (CCDC) of 12 Union Road, Cambridge CB2 1EZ, UK and a proprietary work of CCDC.
# This code may not be used, reproduced, translated, modified, disassembled or
# copied, except in accordance with a valid licence agreement with CCDC and may
# not be disclosed or redistributed in any form, either in whole or in part, to
# any third party. All copies of this code made in accordance with a valid
# licence agreement as referred to above must contain this copyright notice.
#
# No representations, warranties, or liabilities are expressed or implied in the
# supply of this code by CCDC, its servants or agents, except where such
# exclusion or limitation is prohibited, void or unenforceable under governing
# law.
#
'''
The :mod:`ccdc.search` module provides various search classes.

The main classes of the :mod:`ccdc.search` module are:

- :class:`ccdc.search.TextNumericSearch`
- :class:`ccdc.search.SubstructureSearch`
- :class:`ccdc.search.SimilaritySearch`
- :class:`ccdc.search.ReducedCellSearch`
- :class:`ccdc.search.CombinedSearch`

These all inherit from the base class :class:`ccdc.search.Search`. The base
:class:`ccdc.search.Search` contains nested classes defining basic search hits
and settings:

- :class:`ccdc.search.Search.SearchHit`
- :class:`ccdc.search.Search.Settings`

The base class :class:`ccdc.search.Search` also contains the
:func:`ccdc.search.Search.search` function which is used to search the CSD.

All the searches except :class:`ccdc.search.TextNumericSearch` also support
searching of the following additional data sources:

- a Python list of identifiers
- a molecule file path
- a :mod:`ccdc.io` reader
- an individual :class:`ccdc.molecule.Molecule`
- an individual :class:`ccdc.crystal.Crystal`
- a list of molecules, crystals or entries

The :class:`ccdc.search.TextNumericSearch` can only sensibly be applied to
a crystal structure database, which is the CSD by default or a :class:`ccdc.io.EntryReader`
opened on a database file.

The :func:`ccdc.search.Search.search` returns a list of
:class:`ccdc.search.Search.SearchHit` instances. Some of the searches make use
of more specific search hit classes, namely:

- :class:`ccdc.search.TextNumericSearch.TextNumericHit`
- :class:`ccdc.search.SubstructureSearch.SubstructureHit`
- :class:`ccdc.search.SimilaritySearch.SimilarityHit`

Most of the searches return simple Python lists of search hits. However,
a search carried out using a :class:`ccdc.search.SubstructureSearch` returns a
:class:`ccdc.search.SubstructureSearch.SubstructureHitList`, which contains a
:func:`ccdc.search.SubstructureSearch.SubstructureHitList.superimpose` function for superimposing
all the hits on the first instance in the list.

To illustrate some of the searches let us first get an aspirin molecule.

>>> from ccdc.io import EntryReader
>>> csd_reader = EntryReader('CSD')
>>> mol = csd_reader.molecule('ACSALA')

Text numeric searching.

>>> from ccdc.search import TextNumericSearch
>>> text_numeric_search = TextNumericSearch()
>>> text_numeric_search.add_compound_name('aspirin')
>>> hits = text_numeric_search.search()
>>> len(hits)
102

Substructure searching.

>>> from ccdc.search import MoleculeSubstructure, SubstructureSearch
>>> substructure = MoleculeSubstructure(mol)
>>> substructure_search = SubstructureSearch()
>>> _ = substructure_search.add_substructure(substructure)
>>> hits = substructure_search.search()
>>> len(hits)
66

Similarity searching.

>>> from ccdc.search import SimilaritySearch
>>> similarity_search = SimilaritySearch(mol)
>>> hits = similarity_search.search()
>>> len(hits)
113

Reduced cell searching.

>>> from ccdc.search import ReducedCellSearch
>>> crystal = csd_reader.crystal('ACSALA')
>>> query = ReducedCellSearch.CrystalQuery(crystal)
>>> reduced_cell_searcher = ReducedCellSearch(query)
>>> hits = reduced_cell_searcher.search()
>>> len(hits)
17

Combined searches.

>>> from ccdc.search import CombinedSearch
>>> combined_search = CombinedSearch(similarity_search & -text_numeric_search)
>>> hits = combined_search.search()
>>> len(hits)
33

'''
###########################################################################

import sys
import os
import math
import re
import collections
import operator
import warnings
warnings.filterwarnings('always', '.*deprecated.*', DeprecationWarning, '.*', 0)

from ccdc import molecule, io
from ccdc.entry import Entry
from ccdc.crystal import Crystal
from ccdc.io import (
    _CSDDatabaseLocator, _DatabaseReader, CrystalReader,
    EntryReader
)
from ccdc.descriptors import MolecularDescriptors, GeometricDescriptors
from ccdc import utilities

from ccdc import maxint32

from ccdc.utilities import _private_importer
with _private_importer() as pi:
    pi.import_ccdc_module('UtilitiesLib')
    pi.import_ccdc_module('MathsLib')
    pi.import_ccdc_module('ChemistryLib')
    pi.import_ccdc_module('SubstructureSearchLib')
    pi.import_ccdc_module('DatabaseEntryLib')
    pi.import_ccdc_module('CSDSQLDatabaseLib')
    pi.import_ccdc_module('MotifSearchLib')
    pi.import_ccdc_module('ProteinLib')
    pi.import_ccdc_module('FileFormatsLib')
    pi.import_ccdc_module('AnnotationsLib')
    pi.import_ccdc_module('SolubilityPlatformLib')

###########################################################################
#   Queries
###########################################################################


def _decode_condition(r):
    '''PRIVATE: work out the condition from the argument.'''
    if isinstance(r, (int, float)):
        crit = SubstructureSearchLib.EqualTo(r)
    elif isinstance(r, (tuple, list)):
        a = r[0]
        if isinstance(a, (int, float)):
            if len(r) == 2 and isinstance(r[1], (int, float)):
                crit = SubstructureSearchLib.InclusiveRange(min(a, r[1]), max(a, r[1]))
            else:
                raise TypeError('Invalid type for condition %s' % r)
        elif isinstance(a, str):
            op = a.strip()
            if op == '==':
                crit = SubstructureSearchLib.EqualTo(r[1])
            elif op == '>':
                crit = SubstructureSearchLib.Greater(r[1])
            elif op == '<':
                crit = SubstructureSearchLib.Less(r[1])
            elif op == '>=':
                crit = SubstructureSearchLib.GreaterEqual(r[1])
            elif op == '<=':
                crit = SubstructureSearchLib.LessEqual(r[1])
            elif op == '!=':
                crit = SubstructureSearchLib.NotEqualTo(r[1])
            elif op == 'in':
                crit = SubstructureSearchLib.OneOf(r[1])
            else:
                raise TypeError('Invalid operator for condition %s' % r)
        else:
            raise TypeError('Invalid value for condition %s' % r)
    return crit

def _constraint_property(which, doc, nullary=False):
    '''Private: make a property from a class.'''
    return property(
        lambda x: x._get_constraint(which),
        lambda x, value, nullary=nullary: x._set_constraint(which, value, nullary=nullary),
        None,
        doc
    )

class QueryAtom(object):
    '''Atom used to define a substructure search.

    A QueryAtom can be used to represent a single atom type or a set of atom
    types. A QueryAtom can also have additional constraints imposed on it, for
    example that it should be aromatic.

    Let us create a query atom representing an oxygen atom.

    >>> query_atom = QueryAtom('O')
    >>> print(query_atom)
    QueryAtom(O)

    Suppose that we wanted the query atom to be either a carbon or a nitrogen
    atom.

    >>> query_atom = QueryAtom(['C', 'N'])
    >>> print(query_atom)
    QueryAtom(C, N)

    It is possible to add further constraints on a QueryAtom. For example, we
    can insist that it should be aromatic.

    >>> query_atom.aromatic = True
    >>> print(query_atom.aromatic)
    AtomAromaticConstraint: 1
    >>> print(query_atom)
    QueryAtom(C, N)[atom aromaticity: equal to 1]

    See :ref:`query_atoms` for further details.
    '''

    def __init__(self, atomic_symbol='', _substructure_atom=None):
        '''Initialiser.

        :param atomic_symbol: an atomic symbol, a :class:`ccdc.molecule.Atom`
            (its element is used), or a list or tuple of atomic symbols. The
            resulting QueryAtom will match any of the provided symbols. An
            empty value gives a QueryAtom with no element specified.
        :param _substructure_atom: private; an existing
            SubstructureSearchLib.SubstructureAtom to wrap. When given,
            ``atomic_symbol`` is ignored.
        '''
        if _substructure_atom is not None:
            self._substructure_atom = _substructure_atom
        elif not atomic_symbol:
            self._substructure_atom = SubstructureSearchLib.SubstructureAtom()
        elif isinstance(atomic_symbol, (list, tuple)):
            # The first symbol creates the atom, the rest are alternatives.
            self._substructure_atom = SubstructureSearchLib.SubstructureAtom(
                ChemistryLib.Element(atomic_symbol[0])
            )
            for sym in atomic_symbol[1:]:
                self._substructure_atom.add_element(
                    ChemistryLib.Element(sym)
                )
        elif isinstance(atomic_symbol, molecule.Atom):
            self._substructure_atom = SubstructureSearchLib.SubstructureAtom(
                atomic_symbol._atom.element()  # pylint: disable=E1103
            )
        else:
            self._substructure_atom = SubstructureSearchLib.SubstructureAtom(
                ChemistryLib.Element(atomic_symbol)
            )

    def __str__(self):
        '''String representation of a QueryAtom.

        >>> q = QueryAtom(['C', 'N'])
        >>> print(q)
        QueryAtom(C, N)
        '''
        atom = self._substructure_atom
        symbols = []
        if not atom.matches_any_element():
            symbols = [
                atom.element(i).atomic_symbol() for i in range(atom.nelements())
            ]
        text = 'QueryAtom(%s)' % ', '.join(symbols)
        nconstraints = atom.nconstraints()
        if nconstraints:
            text += '[%s]' % ', '.join(
                str(atom.constraint(i)).strip('\n') for i in range(nconstraints)
            )
        # Constraint descriptions may contain tabs; show them as spaces.
        return text.replace('\t', ' ')

    __repr__ = __str__

    def __eq__(self, other):
        '''Return True if other is a QueryAtom wrapping an equal underlying atom.

        The comparison is delegated to the wrapped substructure atoms; the
        original note describes this as comparing their memory location.
        '''
        return isinstance(other, QueryAtom) and self._substructure_atom == other._substructure_atom

    def __ne__(self, other):
        '''Inequality for atoms.'''
        return not self == other

    @property
    def index(self):
        '''Index of this atom in a substructure.

        ``None`` if the index cannot be obtained (the underlying call raised
        RuntimeError), as for an atom not yet added to a substructure.

        >>> atom = QueryAtom(['C', 'N'])
        >>> print(atom.index)
        None
        >>> substructure = QuerySubstructure()
        >>> _ = substructure.add_atom(atom)
        >>> print(atom.index)
        0
        '''
        try:
            return self._substructure_atom.index()
        except RuntimeError:
            return None

    def _get_constraint(self, which, boolean=False):
        '''Private: get a printable representation of a constraint.

        :param which: the constraint class to look up.
        :param boolean: unused; retained so existing callers keep working.
        :returns: ``'<constraint class name>: <condition>'`` or ``None`` if
            the atom has no constraint of that type.
        '''
        ty = which()
        if not self._substructure_atom.has_constraint_of_type(ty):
            return None
        k = self._substructure_atom.constraint_of_type(ty)
        # Two constraint types do not carry a generic condition object.
        if which == SubstructureSearchLib.AtomHas3DSiteConstraint:
            cond = k.get_site_option()
        elif which == SubstructureSearchLib.AtomLabelConstraint:
            cond = k.regular_expression()
        else:
            cond = k.condition()
        return '%s: %s' % (which.__name__, cond)

    def _set_constraint(self, which, value, nullary=False):
        '''Private: set a constraint.

        Any existing constraint of the same type is removed first.
        Removes the constraint if value is None.

        :param which: the constraint class to set.
        :param value: ``None``, a boolean, or a condition accepted by
            :func:`_decode_condition`; a regular expression string for the
            label constraint.
        :param nullary: if True the constraint is constructed without a
            condition.
        '''
        if self._substructure_atom.has_constraint_of_type(which()):
            self._substructure_atom.remove_constraints_of_type(which())
        if value is None:
            return
        if which == SubstructureSearchLib.AtomHas3DSiteConstraint:
            constraint = which()
            constraint.set_site_option(bool(value))
        elif which == SubstructureSearchLib.AtomLabelConstraint:
            constraint = which()
            constraint.set_regular_expression(value)
        elif nullary:
            constraint = which()
        elif value in (True, False):
            constraint = which(SubstructureSearchLib.EqualTo(value))
        else:
            constraint = which(_decode_condition(value))
        self._substructure_atom.add_constraint(constraint)

    acceptor = _constraint_property(
        SubstructureSearchLib.AtomAcceptorTypeConstraint,
        '''Constraint specifying whether or not the QueryAtom is an acceptor.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.acceptor = True
        >>> print(a)
        QueryAtom(C, N)[AtomAcceptorTypeConstraint]
        ''',
        nullary=True
    )

    aromatic = _constraint_property(
        SubstructureSearchLib.AtomAromaticConstraint,
        '''Constraint specifying whether or not the QueryAtom is aromatic.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.aromatic = True
        >>> print(a)
        QueryAtom(C, N)[atom aromaticity: equal to 1]
        '''
    )

    donor = _constraint_property(
        SubstructureSearchLib.AtomDonorTypeConstraint,
        '''Constraint specifying whether or not the QueryAtom is a donor.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.donor = True
        >>> print(a)
        QueryAtom(C, N)[AtomDonorTypeConstraint]
        ''',
        nullary=True
    )

    cyclic = _constraint_property(
        SubstructureSearchLib.AtomCyclicityConstraint,
        '''Constraint specifying whether or not the QueryAtom is part of a cycle.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.cyclic = True
        >>> print(a)
        QueryAtom(C, N)[atom cyclicity: equal to 1]
        '''
    )

    formal_charge = _constraint_property(
        SubstructureSearchLib.AtomFormalChargeConstraint,
        '''Constraint specifying the formal charge on the QueryAtom.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.formal_charge = ('in', [-1, 1])
        >>> print(a)
        QueryAtom(C, N)[charge: one of -1, 1]
        '''
    )

    formal_valency = _constraint_property(
        SubstructureSearchLib.AtomFormalValencyConstraint,
        '''Constraint specifying the formal valency of the QueryAtom.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.formal_valency = ('>', 3)
        >>> print(a)
        QueryAtom(C, N)[atom valency: greater than 3]
        '''
    )

    cyclic_bonds = _constraint_property(
        SubstructureSearchLib.AtomNCyclicBondsConstraint,
        '''Constraint specifying the number of cyclic bonds of the QueryAtom.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.cyclic_bonds = ('!=', 4)
        >>> print(a)
        QueryAtom(C, N)[number of cyclic bonds:not equal to 4]
        '''
    )

    smallest_ring = _constraint_property(
        SubstructureSearchLib.AtomSmallestRingConstraint,
        '''Constraint specifying the size of the smallest ring the QueryAtom forms part of.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.smallest_ring = (5, 6)
        >>> print(a)
        QueryAtom(C, N)[atom smallest ring: in range 5 to 6]
        '''
    )

    num_bonds = _constraint_property(
        SubstructureSearchLib.AtomNBondsConstraint,
        '''Constraint specifying the number of bonds the QueryAtom may have.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.num_bonds = ('<=', 3)
        >>> print(a)
        QueryAtom(C, N)[number of connected atoms: less than or equal to 3]
        '''
    )

    num_hydrogens = _constraint_property(
        SubstructureSearchLib.AtomNHydrogensConstraint,
        '''Constraint specifying the number of hydrogens the QueryAtom may have.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.num_hydrogens = 1
        >>> print(a)
        QueryAtom(C, N)[hydrogen count, including deuterium: equal to 1]
        '''
    )

    unfused_unbridged_ring = _constraint_property(
        SubstructureSearchLib.AtomUnfusedUnbridgedRingConstraint,
        '''Constraint specifying whether or not the QueryAtom is part of an unfused and unbridged ring.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.unfused_unbridged_ring = True
        >>> print(a)
        QueryAtom(C, N)[atom unfused/unbridged ring: equal to 1]
        '''
    )

    nimplicit_hydrogens = _constraint_property(
        SubstructureSearchLib.AtomNImplicitHydrogensConstraint,
        '''Constraint specifying a count of implicit hydrogens.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.nimplicit_hydrogens = 0
        >>> print(a)
        QueryAtom(C, N)[implicit hydrogen count: equal to 0]
        '''
    )

    has_3d_coordinates = _constraint_property(
        SubstructureSearchLib.AtomHas3DSiteConstraint,
        '''Constraint specifying that the atom has 3d coordinates.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.has_3d_coordinates = True
        >>> print(a)
        QueryAtom(C, N)[atom must have 3D site]
        ''',
        nullary=True
    )

    label_match = _constraint_property(
        SubstructureSearchLib.AtomLabelConstraint,
        '''Constraint specifying that the atom label must match a regular expression.

        >>> a = QueryAtom(['C'])
        >>> a.label_match = '^C12$'
        >>> print(a)
        QueryAtom(C)[atom label must match regular expression with pattern: ^C12$]
        ''',
        nullary=True
    )

    @property
    def chirality(self):
        '''Constraint specifying the chirality around an atom.

        The return value will either be None or a tuple of 4 QueryAtoms in clockwise order.

        >>> s = SMARTSSubstructure("FC(I)O[C@](S)(P)H")
        >>> s.atoms[1].chirality is None
        True
        >>> s.atoms[4].chirality
        (QueryAtom(O)[atom aromaticity: equal to 0], QueryAtom(H), QueryAtom(P)[atom aromaticity: equal to 0], QueryAtom(S)[atom aromaticity: equal to 0])
        '''
        rs = SubstructureSearchLib.get_chirality(self._substructure_atom)
        # The lookup result only applies if it refers back to this atom.
        if rs.atom() == self._substructure_atom:
            return tuple(
                QueryAtom(_substructure_atom=a) for a in rs.ordered_bound_atoms()
            )
        return None

    @chirality.setter
    def chirality(self, chirality):
        '''Constraint specifying the chirality around an atom.

        The set value may be None to clear a chirality constraint, or a tuple of 4 ordered atoms
        and optionally a string 'clockwise' (the assumed default) or 'anticlockwise' specifying
        the chiral relationship.

        :raises: RuntimeError if the direction string is not recognised, or if
            the value does not consist of exactly 4 QueryAtoms.

        >>> s = SMARTSSubstructure("O[C@](I)(F)H")
        >>> s.atoms[1].chirality = None
        >>> s.atoms[1].chirality is None
        True
        >>> s.atoms[1].chirality = (s.atoms[0],s.atoms[2],s.atoms[3],s.atoms[4])
        >>> s.atoms[1].chirality
        (QueryAtom(O)[atom aromaticity: equal to 0], QueryAtom(I), QueryAtom(F), QueryAtom(H))
        '''
        if chirality is None:
            SubstructureSearchLib.remove_chirality(self._substructure_atom)
            return
        if len(chirality) >= 5:
            if chirality[4] == "anticlockwise":
                # Swapping the last two atoms reverses the sense of rotation.
                chirality = (chirality[0], chirality[1], chirality[3], chirality[2])
            elif chirality[4] != "clockwise":
                raise RuntimeError("Chirality description must be 'clockwise' or 'anticlockwise'")
            chirality = chirality[0:4]
        # Fewer than four atoms used to surface as a bare IndexError below;
        # report it with the same error as any other malformed value.
        if len(chirality) != 4 or not all(isinstance(atom, QueryAtom) for atom in chirality):
            raise RuntimeError("QueryAtom chirality must be set to 4 QueryAtoms or None")
        atoms = [a._substructure_atom for a in chirality]
        SubstructureSearchLib.set_chirality(self._substructure_atom, atoms[0], atoms[1], atoms[2], atoms[3])

    def add_connected_element_count(self, atomic_symbols, count):
        '''Set the number of connected elements constraint.

        Constraint to define the number of times the QueryAtom should be
        connected to atoms with elements defined in the atomic_symbols list.

        :param atomic_symbols: atomic symbol or list or tuple of atomic symbols.
        :param count: see :ref:`conditions` for details. A plain value gives an
            equality condition; a list or tuple is decoded as a range or as an
            ``(operator, value)`` pair.
        :raises: TypeError if a list or tuple ``count`` is not a valid condition.

        >>> a = QueryAtom(['C', 'N'])
        >>> a.add_connected_element_count(['F', 'Cl'], 2)
        >>> print(a)
        QueryAtom(C, N)[count connected elements equal to 2 from [F,Cl]]
        '''
        # Accept tuples as well as lists, as the initialiser does.
        if isinstance(atomic_symbols, (list, tuple)):
            symbols = atomic_symbols
        else:
            symbols = [atomic_symbols]
        element_set = ChemistryLib.ElementSet()
        for symbol in symbols:
            element_set.add_element(ChemistryLib.Element(symbol))

        if isinstance(count, (list, tuple)):
            # Use the shared decoder so ranges and operator conditions behave
            # as they do for the constraint properties.
            cond = _decode_condition(count)
        else:
            cond = SubstructureSearchLib.EqualTo(count)

        constraint = SubstructureSearchLib.AtomConnectedElementCountConstraint(element_set, cond)
        if self._substructure_atom.has_constraint_of_type(constraint):
            self._substructure_atom.remove_constraints_of_type(constraint)
        self._substructure_atom.add_constraint(constraint)

    def add_protein_atom_type_constraint(self, *types):
        '''Add a constraint that an atom be in one of the protein atom types.

        This is of use only when searching a protein structure.

        :param `*types`: one or more of 'AMINO_ACID', 'LIGAND', 'COFACTOR', 'WATER', 'METAL', 'NUCLEOTIDE', 'UNKNOWN'.

        Any case-insensitive, unique prefix may be used.

        >>> a = QueryAtom('Zn')
        >>> a.add_protein_atom_type_constraint('Ligand', 'Metal')
        >>> print(a)
        QueryAtom(Zn)[protein substructure type : one of 1, 3]
        '''
        _type_dict = utilities.bidirectional_dict(
            AMINO_ACID=AnnotationsLib.ProteinSubstructureData.AMINOACID,
            LIGAND=AnnotationsLib.ProteinSubstructureData.LIGAND,
            COFACTOR=AnnotationsLib.ProteinSubstructureData.COFACTOR,
            WATER=AnnotationsLib.ProteinSubstructureData.WATER,
            METAL=AnnotationsLib.ProteinSubstructureData.METAL,
            NUCLEOTIDE=AnnotationsLib.ProteinSubstructureData.NUCLEOTIDE,
            UNKNOWN=AnnotationsLib.ProteinSubstructureData.UNKNOWN
        )
        indices = [_type_dict.prefix_lookup(t) for t in types]
        # A single type is expressed as equality, several as membership.
        if len(indices) == 1:
            cond = SubstructureSearchLib.EqualTo(indices[0])
        else:
            cond = SubstructureSearchLib.OneOf(indices)
        self._substructure_atom.add_constraint(ProteinLib.ProteinSubstructureTypeAtomConstraint(cond))
class QueryBond(object):
    '''Bond used to define a substructure search.

    A QueryBond can be used to represent a single bond type or a set of bond
    types. A QueryBond can also have additional constraints imposed on it, for
    example that it should be cyclic.

    Let us create a QueryBond that will match any bond type.

    >>> query_bond = QueryBond()
    >>> print(query_bond) # doctest: +NORMALIZE_WHITESPACE
    QueryBond(Unknown, Single, Double, Triple, Quadruple, Aromatic, Delocalised, Pi)

    To create a more specific QueryBond we need to specify some bond types.

    >>> from ccdc.molecule import Bond
    >>> single_bond = Bond.BondType('Single')
    >>> double_bond = Bond.BondType('Double')
    >>> query_bond = QueryBond(single_bond)
    >>> print(query_bond)
    QueryBond(Single)
    >>> query_bond = QueryBond([single_bond, double_bond])
    >>> print(query_bond) # doctest: +NORMALIZE_WHITESPACE
    QueryBond(Single, Double)

    Finally, let us set a constraint for the bond to be cyclic.

    >>> query_bond.cyclic = True
    >>> print(query_bond)
    QueryBond(Single, Double)[bond cyclicity: equal to 1]
    >>> print(query_bond.cyclic)
    BondCyclicityConstraint: 1
    '''

    def __init__(self, bond_type=None, _substructure_bond=None):
        '''Initialise a QueryBond.

        :param bond_type: may be None, for a :class:`QueryBond` that will match any bond,
            a :class:`ccdc.molecule.Bond.BondType` instance which will match only that bond type,
            a string representation which will match only that bond type, 'any' that will match any bond,
            or a list of :class:`ccdc.molecule.Bond.BondType` which will match any of those specified.
        :param _substructure_bond: private; an existing
            SubstructureSearchLib.SubstructureBond to wrap. When given,
            ``bond_type`` is ignored.
        '''
        if _substructure_bond is not None:
            self._substructure_bond = _substructure_bond
            return

        def raw_type(item):
            # Strings name a bond type; anything else is taken to be a BondType.
            if isinstance(item, str):
                return molecule.Bond.BondType(item)._bond_type
            return item._bond_type

        make_bond = SubstructureSearchLib.SubstructureBond
        if bond_type is None:
            self._substructure_bond = make_bond()
        elif isinstance(bond_type, (list, tuple)):
            if len(bond_type):
                # The first entry creates the bond, the rest are alternatives.
                self._substructure_bond = make_bond(raw_type(bond_type[0]))
                for extra in bond_type[1:]:
                    self._substructure_bond.add_type(raw_type(extra))
            else:
                self._substructure_bond = make_bond()
        elif isinstance(bond_type, str):
            if bond_type.lower() == 'any':
                self._substructure_bond = make_bond()
            else:
                self._substructure_bond = make_bond(
                    molecule.Bond.BondType(bond_type)._bond_type
                )
        else:
            self._substructure_bond = make_bond(bond_type._bond_type)

    def __str__(self):
        '''String representation of a QueryBond.

        >>> b = QueryBond(['Single', 'Double'])
        >>> print(b)
        QueryBond(Single, Double)
        '''
        bond = self._substructure_bond
        type_names = [
            str(molecule.Bond.BondType(bond.type(i))) for i in range(bond.ntypes())
        ]
        text = 'QueryBond(' + ', '.join(type_names) + ')'
        if bond.nconstraints():
            described = [str(bond.constraint(i)) for i in range(bond.nconstraints())]
            text += '[' + ', '.join(described) + ']'
        return text

    __repr__ = __str__

    @property
    def atoms(self):
        '''A list of the two QueryAtoms of the bond, if it is in a substructure, or ``None``.

        >>> s = QuerySubstructure()
        >>> c = s.add_atom(QueryAtom('C'))
        >>> n = s.add_atom(QueryAtom('N'))
        >>> b = QueryBond(['Single', 'Double'])
        >>> _ = s.add_bond(b, c, n)
        >>> print(b)
        QueryBond(Single, Double)
        >>> print('%s, %s' % (b.atoms[0], b.atoms[1]))
        QueryAtom(C), QueryAtom(N)
        '''
        try:
            first = QueryAtom(_substructure_atom=self._substructure_bond.atom1())
            second = QueryAtom(_substructure_atom=self._substructure_bond.atom2())
        except RuntimeError:
            return None
        return [first, second]

    def _get_constraint(self, which):
        '''Private: get a string representation of a bond constraint.

        Gives ``'<constraint class name>: <condition>'``, or ``None`` when the
        bond carries no constraint of the requested type.
        '''
        probe = which()
        if not self._substructure_bond.has_constraint_of_type(probe):
            return None
        found = self._substructure_bond.constraint_of_type(probe)
        return '%s: %s' % (which.__name__, found.condition())

    def _set_constraint(self, which, value, nullary=False):
        '''Private: set a bond constraint.

        An existing constraint of the same type is dropped first; a value of
        ``None`` leaves the bond without that constraint.
        '''
        bond = self._substructure_bond
        if bond.has_constraint_of_type(which()):
            bond.remove_constraints_of_type(which())
        if value is None:
            return
        if nullary or value in (True, False):
            cond = SubstructureSearchLib.EqualTo(value)
        else:
            cond = _decode_condition(value)
        bond.add_constraint(which(cond))

    cyclic = _constraint_property(
        SubstructureSearchLib.BondCyclicityConstraint,
        '''Constraint specifying whether or not the :class:`QueryBond` is part of a cycle.

        >>> b = QueryBond('Single')
        >>> b.cyclic = True
        >>> print(b)
        QueryBond(Single)[bond cyclicity: equal to 1]
        '''
    )

    bond_length = _constraint_property(
        SubstructureSearchLib.BondLengthConstraint,
        '''Constraint specifying the length of the bond.

        >>> b = QueryBond('Single')
        >>> c1 = QueryAtom('C')
        >>> c2 = QueryAtom('C')
        >>> s = QuerySubstructure()
        >>> _ = s.add_atom(c1)
        >>> _ = s.add_atom(c2)
        >>> _ = s.add_bond(b, c1, c2)
        >>> b.bond_length = ('>', 1.6)
        >>> print(b)
        QueryBond(Single)[bond length: greater than 1.6]
        '''
    )

    bond_polymeric = _constraint_property(
        SubstructureSearchLib.BondPolymericConstraint,
        '''Constraint specifying whether or not the :class:`QueryBond` is polymeric.

        >>> b = QueryBond('Single')
        >>> b.bond_polymeric = True
        >>> print(b)
        QueryBond(Single)[bond polymeric: equal to 1]
        '''
    )

    bond_smallest_ring = _constraint_property(
        SubstructureSearchLib.BondSmallestRingConstraint,
        '''Constraint specifying the smallest ring the bond should be a part of.

        >>> b = QueryBond('Aromatic')
        >>> b.bond_smallest_ring = 5
        >>> print(b)
        QueryBond(Aromatic)[bond smallest ring: equal to 5]
        '''
    )

    bond_unfused_unbridged_ring = _constraint_property(
        SubstructureSearchLib.BondUnfusedUnbridgedRingConstraint,
        '''Constraint specifying whether or not the :class:`QueryBond` is part of an unfused and unbridged ring.

        >>> b = QueryBond('Single')
        >>> b.bond_unfused_unbridged_ring = True
        >>> print(b)
        QueryBond(Single)[bond unfused/unbridged ring: equal to 1]
        '''
    )

    @property
    def stereochemistry(self):
        r'''Constraint specifying the stereochemistry around a double bond.

        The return value will either be None or a tuple of 2 QueryAtoms and one of 'cis' or 'trans'.

        >>> s = SMARTSSubstructure(r"I/C=C\F")
        >>> s.bonds[1].stereochemistry
        (QueryAtom(I), QueryAtom(F), 'cis')
        '''
        ez = SubstructureSearchLib.get_stereochemistry(self._substructure_bond)
        # The lookup result only applies if it refers back to this bond.
        if not self._substructure_bond.is_same_bond(ez.bond()):
            return None
        if ez.stereochemistry() == SubstructureSearchLib.EZStereoChemistryFlag_E_STEREOCHEMISTRY:
            label = "trans"
        elif ez.stereochemistry() == SubstructureSearchLib.EZStereoChemistryFlag_Z_STEREOCHEMISTRY:
            label = "cis"
        else:
            return None
        near_first = QueryAtom(_substructure_atom=ez.adjacent_to_first())
        near_second = QueryAtom(_substructure_atom=ez.adjacent_to_second())
        return (near_first, near_second, label)

    @stereochemistry.setter
    def stereochemistry(self, stereo):
        '''Set a stereochemistry constraint on a bond.

        The set value may be None to remove stereochemistry, or a tuple of 2 atoms adjacent to the bond's atoms
        and a string either 'cis' or 'trans'

        >>> s = SMARTSSubstructure(R"IC=CF")
        >>> s.bonds[1].stereochemistry = (s.atoms[0], s.atoms[3], 'trans')
        >>> s.bonds[1].stereochemistry
        (QueryAtom(I), QueryAtom(F), 'trans')
        '''
        if stereo is None:
            SubstructureSearchLib.remove_stereochemistry(self._substructure_bond)
            return
        adj1, adj2, flag = stereo
        if flag == "cis":
            ez_flag = SubstructureSearchLib.EZStereoChemistryFlag_Z_STEREOCHEMISTRY
        elif flag == "trans":
            ez_flag = SubstructureSearchLib.EZStereoChemistryFlag_E_STEREOCHEMISTRY
        else:
            raise RuntimeError("stereochemistry flag must be either 'cis' or 'trans'")
        SubstructureSearchLib.set_stereochemistry(
            SubstructureSearchLib.SubstructureEZStereoChemistry(
                ez_flag,
                self._substructure_bond,
                adj1._substructure_atom,
                adj2._substructure_atom
            )
        )
###########################################################################
[docs]class QuerySubstructure(object): '''Class to define and run substructure searches. As an example let us set up a QuerySubstructure for a carbonyl (C=O). >>> from ccdc.molecule import Bond >>> double_bond = Bond.BondType('Double') >>> substructure_query = QuerySubstructure() >>> query_atom1 = substructure_query.add_atom('C') >>> query_atom2 = substructure_query.add_atom('O') >>> query_bond = substructure_query.add_bond(double_bond, query_atom1, query_atom2) ''' def __init__(self, _substructure=None): '''Create a substructure. If the _substructure parameter is set it should be a SubstructureSearchLib.Substructure. ''' if _substructure is None: self._substructure = SubstructureSearchLib.Substructure.instantiate() else: self._substructure = _substructure self._searcher = None self.measurements = [] self._constraints = None self._geometric_constraints = None self._geometric_objects = None
[docs] def clear(self): '''Restart the query.''' self._substructure = SubstructureSearchLib.Substructure.instantiate() self._searcher = None self.measurements = [] self._constraints = None self._geometric_constraints = None self._geometric_objects = None
[docs] def add_atom(self, atom): '''Add an atom to the substructure. :param atom: may be a QueryAtom separately constructed, an atom of a molecule, or an atomic symbol. :returns: :class:`QueryAtom` >>> q = QuerySubstructure() >>> a = q.add_atom(QueryAtom(['N', 'O'])) >>> print(a) QueryAtom(N, O) ''' if isinstance(atom, QueryAtom): at = atom elif isinstance(atom, molecule.Atom): at = QueryAtom(atom.atomic_symbol) else: at = QueryAtom(atom) self._substructure.add(at._substructure_atom) return at
@property def atoms(self): '''The query atoms in the substructure. >>> q = QuerySubstructure() >>> _ = q.add_atom(QueryAtom('C')) >>> _ = q.add_atom(QueryAtom(['O', 'N'])) >>> atoms = q.atoms >>> print('%s, %s' % (atoms[0], atoms[1])) QueryAtom(C), QueryAtom(N, O) ''' return [ QueryAtom(_substructure_atom=self._substructure.atom(i)) for i in range(self._substructure.natoms()) ]
[docs] def add_bond(self, bond, atom1=None, atom2=None): '''Add a bond to the substructure. :param bond: may be a :class:`QueryBond`, a :class:`ccdc.molecule.Bond.BondType`, a :class:`ccdc.molecule.Bond`, a string or an int. :param atom1: :class:`QueryAtom` or ``None`` for any atom :param atom2: :class:`QueryAtom` or ``None`` for any atom :returns: :class:`QueryBond` :raises: TypeError if an improper bond argument is supplied >>> s = QuerySubstructure() >>> c = s.add_atom(QueryAtom('C')) >>> o1 = s.add_atom(QueryAtom('O')) >>> o2 = s.add_atom(QueryAtom('O')) >>> h = s.add_atom(QueryAtom('H')) >>> _ = s.add_bond(QueryBond('Double'), c, o1) >>> _ = s.add_bond(QueryBond('Single'), c, o2) >>> _ = s.add_bond(QueryBond('Single'), o2, h) ''' if isinstance(bond, molecule.Bond.BondType): sub_bond = SubstructureSearchLib.SubstructureBond(bond._bond_type) bond = QueryBond(_substructure_bond=sub_bond) elif isinstance(bond, molecule.Bond): sub_bond = SubstructureSearchLib.SubstructureBond(bond.bond_type._bond_type) bond = QueryBond(_substructure_bond=sub_bond) elif isinstance(bond, QueryBond): pass elif isinstance(bond, str): if bond.lower() == 'any': bond = QueryBond() else: ty = molecule.Bond.BondType(bond)._bond_type sub_bond = SubstructureSearchLib.SubstructureBond(ty) bond = QueryBond(_substructure_bond=sub_bond) elif isinstance(bond, int): ty = ChemistryLib.BondType(bond) sub_bond = SubstructureSearchLib.SubstructureBond(ty) bond = QueryBond(_substructure_bond=sub_bond) else: raise TypeError('Improper argument to add_bond(%s)' % bond) if atom1 is None: atom1 = QueryAtom() if isinstance(atom1, molecule.Atom): atom1 = self.add_atom(atom1) elif isinstance(atom1, str): atom1 = self.add_atom(atom1) if atom1.index is None: atom1 = self.add_atom(atom1) if atom2 is None: atom2 = QueryAtom() if isinstance(atom2, molecule.Atom): atom2 = self.add_atom(atom2) elif isinstance(atom2, str): atom2 = self.add_atom(atom2) if atom2.index is None: atom2 = self.add_atom(atom2) 
self._substructure.add( bond._substructure_bond, atom1.index, atom2.index ) return bond
@property
def bonds(self):
    '''The bonds in the substructure.

    A new list of :class:`QueryBond` wrappers is built on every access, one
    per bond of the underlying substructure, in index order.

    >>> s = QuerySubstructure()
    >>> b = s.add_bond('Single', QueryAtom('C'), QueryAtom('F'))
    >>> bonds = s.bonds
    >>> print(bonds[0])
    QueryBond(Single)
    '''
    wrapped = []
    for position in range(self._substructure.nbonds()):
        raw_bond = self._substructure.bond(position)
        wrapped.append(QueryBond(_substructure_bond=raw_bond))
    return wrapped
def write_xml(self, file_name):
    '''Write an XML representation of the substructure. Deprecated.

    A :class:`DeprecationWarning` is issued on every call.

    :param file_name: path to XML file
    '''
    warnings.warn('''This method is deprecated and will be removed in a later version.''', DeprecationWarning)
    w = SubstructureSearchLib.XMLSubstructureWriter()
    ostr = UtilitiesLib.ofstream(file_name)
    try:
        opts = SubstructureSearchLib.XMLSubstructureOptions()
        w.write(
            self._substructure, opts,
            SubstructureSearchLib.XMLSubstructureWriter.SUBSTRUCTURE_SEARCH, ostr
        )
    finally:
        # Close the stream even when writing fails, so the handle is not leaked.
        ostr.close()
def match_atom(self, atom, query_atom=None):
    '''Whether or not the given atom matches the query_atom in the given context.

    The substructure is matched against the molecule owning ``atom`` with the
    chosen query atom pinned to ``atom``.

    :param atom: a :class:`ccdc.molecule.Atom` instance.
    :param query_atom: a :class:`ccdc.search.QueryAtom` instance or ``None``.
        If ``None``, the first atom of the substructure will be used.
    :returns: bool

    >>> s = QuerySubstructure()
    >>> _ = s.add_bond('Single', QueryAtom('Cl'), QueryAtom('C'))
    >>> mol = EntryReader('csd').molecule('AABHTZ')
    >>> s.match_atom(mol.atom('Cl1'))
    True
    >>> s.match_atom(mol.atom('C1'))
    False
    >>> s.match_atom(mol.atom('C1'), s.atoms[1])
    True
    '''
    query_index = 0 if query_atom is None else query_atom.index
    searcher = SubstructureSearchLib.SubstructureMoleculeGraphSearch(
        self._substructure,
        SubstructureSearchLib.SubstructureMoleculeMatchCriteria()
    )
    # Constrain the search: query atom `query_index` must map onto `atom`.
    pinned = {query_index: atom.index}
    found = searcher.find_matches(atom._atom.molecule(), pinned)
    return bool(found)
def nmatch_molecule(self, molecule):
    '''Returns number of query matches within the specified molecule.

    The count is the number of atoms of ``molecule`` for which
    :meth:`match_atom` succeeds with its default query atom.

    :param molecule: a :class:`ccdc.molecule.Molecule` instance.
    :returns: integer

    >>> s = QuerySubstructure()
    >>> _ = s.add_bond('Single', QueryAtom('Cl'), QueryAtom('C'))
    >>> mol = EntryReader('csd').molecule('AABHTZ')
    >>> s.nmatch_molecule(mol)
    2
    '''
    return sum(1 for candidate in molecule.atoms if self.match_atom(candidate))
def match_molecule(self, molecule):
    '''Whether or not the query matches the specified molecule.

    :param molecule: a :class:`ccdc.molecule.Molecule` instance.
    :returns: bool

    >>> s = QuerySubstructure()
    >>> _ = s.add_bond('Double', QueryAtom('C'), QueryAtom('O'))
    >>> mol = EntryReader('csd').molecule('AABHTZ')
    >>> s.match_molecule(mol)
    True
    '''
    searcher = SubstructureSearchLib.SubstructureMoleculeGraphSearch(
        self._substructure,
        SubstructureSearchLib.SubstructureMoleculeMatchCriteria()
    )
    found = searcher.find_matches(molecule._molecule)
    return bool(found)
###################################################################################
class SMARTSSubstructure(QuerySubstructure):
    '''Make a substructure from a SMARTS string.

    Let us create a ketone SMARTSSubstructure as an example.

    >>> smarts_query = SMARTSSubstructure("[CD4][CD3](=[OD1])[CD4]")
    >>> print(smarts_query.smarts)
    [CD4][CD3](=[OD1])[CD4]

    There is a minor extension to Daylight SMARTS to allow the representation of quadruple, delocalised and pi bonds,
    using the characters '_', '"' and '|' respectively.

    There is a second minor extension to allow easy access to the indices of the atoms.

    >>> query = SMARTSSubstructure("[#6:0]([#7]-H)[#8:1][#6:2]")
    >>> print(query.label_to_atom_index(0))
    0
    >>> print(query.label_to_atom_index(1))
    3
    '''

    def __init__(self, smarts):
        '''Initialise a SMARTS query with a string.'''
        self._reader = SubstructureSearchLib.SMARTSSubstructureReader()
        # Assigning through the property parses the string and sets
        # self._substructure, which the base initialiser then receives.
        self.smarts = smarts
        QuerySubstructure.__init__(self, _substructure=self._substructure)

    @property
    def smarts(self):
        '''The SMARTS string.'''
        return self._smarts

    @smarts.setter
    def smarts(self, smarts):
        '''Ensure _substructure is updated.

        The string is re-parsed with the reader held by this instance and
        the list of measurements is reset to empty.
        '''
        self._smarts = smarts
        parsed = self._reader.substructure(self._smarts)
        self._substructure = parsed
        self.measurements = []

    def label_to_atom_index(self, label):
        '''Translate a SMARTS label into the appropriate substructure atom index

        :param label: the label; it is converted with ``str`` before the lookup.
        :raises: KeyError if the reader returns nothing for the label.
        '''
        labelled = self._reader.label_to_atom(str(label))
        if labelled:
            return labelled.index()
        raise KeyError(f"No atom with label {label}")
###################################################################################
class MoleculeSubstructure(QuerySubstructure):
    '''Make a substructure query from an entire molecule.

    Can be used to search for exact matches of a molecule when appropriate num_bonds
    or add_connected_element_count constraints are set on the QueryAtoms.

    Furthermore if hydrogen atoms have been removed from the molecule used to initialise the
    MoleculeSubstructure it can be used to find hits that match the heavy atoms as a substructure.

    :param mol: :class:`ccdc.molecule.Molecule`
    :param match_stereochemistry: Should the substructure constrain target stereochemistry to match
        the input molecule's stereochemistry?
    :raises: TypeError if the passed in molecule has multiple components since multi-component molecule
        substructure searches are not supported. The components should be added as separate substructures.

    >>> mol = EntryReader('csd').molecule('AABHTZ')
    >>> sub = MoleculeSubstructure(mol)
    '''

    def __init__(self, mol, match_stereochemistry=False):
        '''Initialise a MoleculeSubstructure with a molecule.'''
        # Only single-component molecules can be turned into one substructure.
        if len(mol.components) > 1:
            raise TypeError('Multi-component molecule substructures are not supported')
        if match_stereochemistry:
            stereo_mode = SubstructureSearchLib.Substructure.MATCH_STEREOCHEMISTRY
        else:
            stereo_mode = SubstructureSearchLib.Substructure.NO_STEREOCHEMISTRY
        built = SubstructureSearchLib.Substructure.instantiate(mol._molecule, stereo_mode)
        QuerySubstructure.__init__(self, _substructure=built)
###################################################################################
class ConnserSubstructure(QuerySubstructure):
    '''Read a Conquest query language file.'''

    # Text that must occur somewhere in a Connser file (matched case-insensitively).
    required_content = re.compile(r'\*CONN', re.IGNORECASE)

    def __init__(self, file_name, _conn=None):
        '''Read the file.

        :param file_name: path to the Connser file
        :param _conn: private; an already-populated Connser file object.  When
            given, ``file_name`` is not opened and the substructure is named 'string'.
        :raises: IOError if the file cannot be read or if it is empty or if it does not contain '*CONN'
        '''
        if _conn is None:
            try:
                f = open(file_name)
            except Exception as exc:
                # Any failure to open is reported as IOError, as documented,
                # but KeyboardInterrupt/SystemExit are no longer swallowed.
                raise IOError('File cannot be read: %s' % file_name) from exc
            # The context manager closes the handle even if reading fails.
            with f:
                txt = f.read()
            if not txt or self.required_content.search(txt) is None:
                raise IOError('File is not a connser file: %s' % file_name)
            self._conn = SubstructureSearchLib.ConnserFile(file_name)
            self.name = os.path.splitext(os.path.basename(file_name))[0]
        else:
            self._conn = _conn
            self.name = 'string'
        substructure = self._conn.substructure()
        QuerySubstructure.__init__(self, _substructure=substructure)

    def interaction_library_contact_atoms(self):
        '''Provide the list of indexes of atoms into the substructure (optionally) defined
        in the ConnSer query for generating the data in the CCDC interaction library

        The list of indexes are into the list of substructure atoms with the associated substructure

        see :mod:`ccdc.interactions` for more information on the interaction library
        '''
        return self._conn.isostar_contact_atom_indexes()

    @staticmethod
    def from_string(text):
        '''Create a substructure from a textual representation of a Connser file.

        :param text: the Connser text; it is converted with ``str`` before being read.
        :returns: :class:`ConnserSubstructure`
        '''
        _conn = SubstructureSearchLib.ConnserFile()
        stream = UtilitiesLib.istringstream(str(text))
        _conn.read(stream)
        return ConnserSubstructure('string', _conn=_conn)
###########################################################################


class XMLSubstructure(QuerySubstructure):
    '''A :class:`ccdc.search.QuerySubstructure` read from an XML file. Deprecated.'''

    def __init__(self, fname):
        '''Initialise from an XML formatted file. Deprecated.

        A :class:`DeprecationWarning` is issued before the file is checked.

        :param fname: path to XML file
        :raises: IOError if the file does not exist
        '''
        warnings.warn('''This class is deprecated and will be removed in a later version.''', DeprecationWarning)
        file_is_present = os.path.exists(fname)
        if not file_is_present:
            raise IOError('The file %s does not exist' % fname)
        QuerySubstructure.__init__(self)
        xml_reader = SubstructureSearchLib.XMLSubstructureReader()
        xml_reader.load(fname)
        # Only the first substructure in the file is used.
        self._substructure = xml_reader.substructure(0)


###########################################################################
#   Searches
###########################################################################
class SimilaritySearch(Search):
    '''Class to define and run similarity searches.'''

    class Settings(Search.Settings):
        '''Settings for a similarity search.'''

        coeffs = utilities.bidirectional_dict(
            dice=SubstructureSearchLib.DICE,
            tanimoto=SubstructureSearchLib.TANIMOTO,
        )
        _sort_order = utilities.bidirectional_dict(
            value_order=SubstructureSearchLib.VALUE_ORDER,
            alphabetic_order=SubstructureSearchLib.ALPHABETIC_ORDER
        )

        def __init__(self, threshold=0.7, coefficient='tanimoto', _settings=None):
            '''Create the settings.

            :param threshold: similarity threshold; ``None`` means the default, 0.7.
            :param coefficient: 'tanimoto' or 'dice'; ``None`` means the default, 'tanimoto'.
            :param _settings: private; an existing low-level settings object to wrap,
                in which case ``threshold`` and ``coefficient`` are ignored.
            '''
            if _settings is None:
                if threshold is None:
                    threshold = 0.7
                if coefficient is None:
                    coefficient = 'tanimoto'
                _settings = CSDSQLDatabaseLib.SimilaritySearchSettings(
                    self.coeffs.prefix_lookup(coefficient), threshold
                )
            self._settings = _settings
            super(SimilaritySearch.Settings, self).__init__(_settings=self._settings)

        @property
        def threshold(self):
            '''The similarity threshold to apply.

            This is a value between 0.0 and 1.0.
            '''
            return self._settings.threshold()

        @threshold.setter
        def threshold(self, value):
            self._settings.set_threshold(value)

        @property
        def coefficient(self):
            '''This should be either 'dice' or 'tanimoto', the default.'''
            return self.coeffs.inverse_lookup(self._settings.coefficient())

        @coefficient.setter
        def coefficient(self, value):
            # Strings are translated through the coefficient table; anything
            # else is handed to the low-level settings unchanged.
            if isinstance(value, str):
                self._settings.set_coefficient(self.coeffs.prefix_lookup(value))
            else:
                self._settings.set_coefficient(value)

        @property
        def sort_order(self):
            '''The order in which hits will be sorted.

            This should be either 'alphabetic' or 'value', the default.
            '''
            current = self._settings.sort_order
            # The sibling accessors threshold() and coefficient() are methods,
            # so call sort_order too when it is one instead of looking up the
            # bound method itself.
            if callable(current):
                current = current()
            return self._sort_order.inverse_lookup(current)

        @sort_order.setter
        def sort_order(self, value):
            self._settings.set_sort_order(self._sort_order.prefix_lookup(value))

    class SimilarityHit(Search.SearchHit):
        '''A search hit recording the similarity measure.

        The SimilarityHit instance will give access to the identifier of the hit, the value of the similarity
        to the query molecule, the entry, crystal or molecule of the hit.
        '''

        def __init__(self, similarity, identifier, _database=None, _entry=None, _crystal=None, _molecule=None,
                     _binary_database=None):
            Search.SearchHit.__init__(
                self, identifier, _database=_database, _entry=_entry, _crystal=_crystal, _molecule=_molecule,
                _binary_database=_binary_database)
            self.similarity = similarity
            self.identifier = identifier

    def __init__(self, mol=None, threshold=0.7, coefficient='tanimoto', settings=None):
        '''Save the threshold and instantiate the databases.

        :param mol: :class:`ccdc.molecule.Molecule` or :class:`ccdc.search.QuerySubstructure`
        :param threshold: float (0.0 to 1.0)
        :param coefficient: one of 'tanimoto' or 'dice'
        :param settings: a :class:`SimilaritySearch.Settings`; when given,
            ``threshold`` and ``coefficient`` are ignored.
        '''
        if settings is None:
            settings = SimilaritySearch.Settings(threshold, coefficient)
        self.settings = settings  # ignoring threshold and coeff if settings is provided
        self.molecule = mol

    @property
    def molecule(self):
        '''The query molecule.'''
        return self._molecule

    @molecule.setter
    def molecule(self, mol):
        self._molecule = mol
        if mol is None:
            # then it'll have to be provided later
            self._substructure = None
        elif isinstance(mol, QuerySubstructure):
            self._fp = SubstructureSearchLib.ChemicalFingerprintBuilderSubstructure()
            self._sp = self._fp.similarity_fingerprint(mol._substructure)
            self._substructure = mol._substructure
        else:
            self._fp = SubstructureSearchLib.ChemicalFingerprintBuilderMolecule()
            self._sp = self._fp.similarity_fingerprint(mol._molecule)
            self._substructure = SubstructureSearchLib.Substructure.instantiate(mol._molecule)

    @staticmethod
    def from_xml(xml):
        '''Create a SimilaritySearch from an XML representation.

        A coefficient or threshold that cannot be read from the XML
        (``RuntimeError``) falls back to the default value.

        :param xml: XML string
        '''
        stream = UtilitiesLib.istringstream(xml)
        reader = SubstructureSearchLib.XMLSubstructureReader()
        reader.load(stream)
        try:
            coeff = SimilaritySearch.Settings.coeffs.inverse_lookup(
                reader.options().similarity_coefficient()
            )
        except RuntimeError:
            coeff = None
        try:
            thresh = reader.options().similarity_threshold()
        except RuntimeError:
            thresh = None
        q = QuerySubstructure(_substructure=reader.substructure(0))
        return SimilaritySearch(q, thresh, coeff)

    @staticmethod
    def from_xml_file(file_name):
        '''Create a SimilaritySearch from an XML file.

        :param file_name: path to XML file
        :raises: IOError when the file does not exist
        '''
        if not os.path.exists(file_name):
            raise IOError('The file %s does not exist' % file_name)
        with open(file_name) as f:
            return SimilaritySearch.from_xml(f.read())

    def read_xml(self, xml):
        '''Read a query from an XML representation.

        A coefficient or threshold that cannot be read from the XML
        (``RuntimeError``) leaves the current setting unchanged.  After the
        call :attr:`molecule` is ``None``.

        :param xml: XML string
        '''
        stream = UtilitiesLib.istringstream(xml)
        reader = SubstructureSearchLib.XMLSubstructureReader()
        reader.load(stream)
        try:
            self.settings.coefficient = SimilaritySearch.Settings.coeffs.inverse_lookup(
                reader.options().similarity_coefficient()
            )
        except RuntimeError:
            pass
        try:
            self.settings.threshold = reader.options().similarity_threshold()
        except RuntimeError:
            pass
        sub = reader.substructure(0)
        self._substructure = sub
        self._molecule = None
        # Rebuild the query fingerprint too, exactly as the molecule setter
        # does for a QuerySubstructure; otherwise molecule, crystal and entry
        # searches would keep using the fingerprint of a previous query (or
        # fail because none was ever built).
        self._fp = SubstructureSearchLib.ChemicalFingerprintBuilderSubstructure()
        self._sp = self._fp.similarity_fingerprint(sub)

    def read_xml_file(self, file_name):
        '''Read an XML file into the similarity searcher.

        :param file_name: path to XML file
        :raises: IOError if the file cannot be read
        '''
        if not os.path.exists(file_name):
            raise IOError('The file %s does not exist' % file_name)
        with open(file_name) as f:
            self.read_xml(f.read())

    @property
    def threshold(self):
        '''The similarity threshold to use.'''
        return self.settings.threshold

    @threshold.setter
    def threshold(self, value):
        '''Sets the value of threshold.'''
        self.settings.threshold = value

    @property
    def coefficient(self):
        '''Which coefficient to use when determining similarity.'''
        return self.settings.coefficient

    @coefficient.setter
    def coefficient(self, value):
        self.settings.coefficient = value

    def _similarity(self, mol):
        '''Private: the configured similarity coefficient between the query and ``mol``.'''
        fp = self._fp.similarity_fingerprint(mol._molecule)
        if self.settings.coefficient.lower() == 'dice':
            return self._sp.dice(fp)
        return self._sp.tanimoto(fp)

    def search_molecule(self, mol):
        '''Search a molecule.

        This can be used to determine a similarity coefficient against the given molecule.
        Neither the threshold nor the settings filters are applied: a hit is always returned.

        :param mol: :class:`ccdc.molecule.Molecule`
        :returns: :class:`SimilaritySearch.SimilarityHit`

        >>> csd = EntryReader('csd')
        >>> query_mol = csd.molecule('HXACAN')
        >>> searcher = SimilaritySearch(query_mol)
        >>> hit = searcher.search_molecule(csd.molecule('IBPRAC'))
        >>> print(round(hit.similarity, 3))
        0.161
        '''
        return SimilaritySearch.SimilarityHit(self._similarity(mol), mol.identifier, _molecule=mol)

    def _search_reader(self, reader):
        '''Private: run the search against a database reader and return the list of hits.'''
        self.settings._settings.reset_hits()
        # The low-level searcher is created once and cached on the reader.
        if not hasattr(reader, '_similarity_searcher'):
            reader._similarity_searcher = reader._db.searcher_factory().similarity_searcher()

        if not self.settings._has_filter_set():
            results = reader._similarity_searcher.search(self._substructure, self.settings._settings)
            return [
                SimilaritySearch.SimilarityHit(r.similarity(), r.identifier().str(), _binary_database=reader._db)
                for r in results
            ]

        # With filters set, the hit limit has to be applied after filtering,
        # so lift it for the low-level search and enforce it in the loop.
        max_hits = self.settings.max_hit_structures
        if max_hits != maxint32:
            self.settings.max_hit_structures = maxint32
        try:
            results = reader._similarity_searcher.search(self._substructure, self.settings._settings)
            hits = []
            for r in results:
                if max_hits and len(hits) >= max_hits:
                    break
                h = SimilaritySearch.SimilarityHit(
                    r.similarity(), r.identifier().str(), _binary_database=reader._db)
                if self.settings.test(h.entry):
                    hits.append(h)
        finally:
            # Put the caller's limit back even if the search or a filter raised.
            self.settings.max_hit_structures = max_hits
        return hits

    def _search_entry(self, entry):
        '''Private: search a single entry; an empty list if it is filtered out or has no usable molecule.'''
        if self.settings.test(entry):
            try:
                mol = entry.molecule
            except TypeError:
                return []
            return self._search_molecule(mol)
        return []

    def _search_crystal(self, crystal):
        '''Private: search a single crystal; an empty list if it is filtered out or has no usable molecule.'''
        if self.settings.test(crystal):
            try:
                mol = crystal.molecule
            except TypeError:
                return []
            return self._search_molecule(mol)
        return []

    def _search_molecule(self, mol):
        '''Private: a one-element list holding the hit if ``mol`` passes the filters and the threshold, else empty.'''
        if self.settings.test(mol):
            coeff = self._similarity(mol)
            if coeff >= self.settings.threshold:
                return [SimilaritySearch.SimilarityHit(coeff, mol.identifier, _molecule=mol)]
        return []
###################################################################################
class TextNumericSearch(Search):
    '''Class to define and run text/numeric searches in a crystal structure database.

    It is possible to add one or more criterion for the query to match.

    >>> text_numeric_query = TextNumericSearch()
    >>> text_numeric_query.add_compound_name('aspirin')
    >>> text_numeric_query.add_citation(year=[2011, 2013])
    >>> for hit in text_numeric_query.search(max_hit_structures=3):
    ...     print(hit.identifier)
    ...
    ACSALA19
    ACSALA20
    ACSALA21

    A human-readable representation of the queries may be obtained:

    >>> print(', '.join(q for q in text_numeric_query.queries))
    Compound name aspirin anywhere , Journal year in range 2011-2013
    '''

    # Names accepted for the ``mode`` argument of the text queries, mapped to
    # the low-level match types.
    modes = utilities.bidirectional_dict(
        anywhere=DatabaseEntryLib.ANYWHERE,
        exact=DatabaseEntryLib.EXACT_WORD,
        separate=DatabaseEntryLib.EXACT_SPACE_SEPARATED_WORD,
        is_null=DatabaseEntryLib.IS_NULL,
        not_null=DatabaseEntryLib.NOT_NULL,
        start_of_word=DatabaseEntryLib.START_OF_WORD,
        start=DatabaseEntryLib.STARTS_WITH,
    )

    # Human-readable labels for the numeric fields, used when formatting queries.
    _numeric_fields = {
        DatabaseEntryLib.CCDC_JOURNAL_CODEN: 'Journal identifier',
        DatabaseEntryLib.JOURNAL_YEAR: 'Journal year',
        DatabaseEntryLib.CCDC_DEPOSITION_NUMBER: 'CCDC number',
        DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_DYNAMIC_DISORDER:
            'Predicted semiconductor dynamic disorder',
        DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_SINGLET_STATE_1_ENERGY:
            'Predicted semiconductor singlet state 1 energy',
        DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_SINGLET_STATE_2_ENERGY:
            'Predicted semiconductor singlet state 2 energy',
        DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_TRIPLET_STATE_1_ENERGY:
            'Predicted semiconductor triplet state 1 energy',
        DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_TRIPLET_STATE_2_ENERGY:
            'Predicted semiconductor triplet state 2 energy',
        DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_REORGANIZATION_ENERGY:
            'Predicted semiconductor hole reorganization energy',
        DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_TRANSFER_INTEGRAL:
            'Predicted semiconductor transfer integral',
        DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_HOMO_LUMO_GAP:
            'Predicted semiconductor HOMO-LUMO gap',
        DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_SINGLET_STATE_1_OSCILLATOR_STRENGTH:
            'Predicted semiconductor singlet state 1 oscillator strength',
        DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_SINGLET_STATE_2_OSCILLATOR_STRENGTH:
            'Predicted semiconductor singlet state 2 oscillator strength',
    }

    # Human-readable labels for the text fields, used when formatting queries.
    _text_fields = {
        DatabaseEntryLib.ALL_TEXT: 'All text',
        DatabaseEntryLib.ANALOGUES: 'Analogues',
        DatabaseEntryLib.AUTHOR_NAME: 'Author',
        DatabaseEntryLib.BIOACTIVITY: 'Bioactivity',
        DatabaseEntryLib.COLOR: 'Color',
        DatabaseEntryLib.COMPOUND_NAME: 'Compound name',
        DatabaseEntryLib.DISORDER: 'Disorder',
        DatabaseEntryLib.DOI: 'DOI',
        DatabaseEntryLib.HABIT: 'Habit',
        DatabaseEntryLib.JDS_DEPOSITION_NUMBER: 'JDS deposition number',
        DatabaseEntryLib.JOURNAL_PAGE: 'Journal page',
        DatabaseEntryLib.JOURNAL_VOLUME: 'Journal volume',
        DatabaseEntryLib.PEPTIDE_SEQUENCE: 'Peptide sequence',
        DatabaseEntryLib.PHASE_TRANSITIONS: 'Phase transitions',
        DatabaseEntryLib.POLYMORPH: 'Polymorph',
        DatabaseEntryLib.RECRYSTALLISATION_SOLVENT: 'Recrystallisation solvent',
        DatabaseEntryLib.REFCODE: 'All refcodes',
        DatabaseEntryLib.MAIN_REFCODE_ONLY: 'Refcode',
        DatabaseEntryLib.SOURCE: 'Source',
        DatabaseEntryLib.SPACEGROUP_NAME: 'Spacegroup',
        DatabaseEntryLib.SYNONYMS: 'Synonyms',
        DatabaseEntryLib.HEAT_CAPACITY_NOTES: 'Heat capacity notes',
        DatabaseEntryLib.HEAT_OF_FUSION_NOTES: 'Heat of fusion notes',
        DatabaseEntryLib.SOLUBILITY_NOTES: 'Solubility notes',
    }
class TextNumericSearchSettings(Search.Settings):
    '''Settings for a text/numeric search.

    Nothing is added to or overridden from :class:`Search.Settings`.
    '''
class TextNumericHit(Search.SearchHit):
    '''Hit from a TextNumericSearch.'''

    def __init__(self, identifier, _db):
        '''Store identifier and database

        :param identifier: identifier of the hit structure.
        :param _db: private; forwarded to the base class as its ``_binary_database``.
        '''
        super().__init__(identifier, _binary_database=_db)
def __init__(self, settings=None):
    '''Initialise a text-numeric query.

    :param settings: the settings object to use, or ``None`` to create a
        fresh ``TextNumericSearch.Settings()``.
    '''
    # NOTE(review): the nested settings class in this file is called
    # TextNumericSearchSettings, so ``Settings`` here presumably resolves to
    # the one inherited from Search - confirm that this is intended.
    self.settings = TextNumericSearch.Settings() if settings is None else settings
    # Start with an empty set of queries.
    self.clear()
    self._journal_list = None
def clear(self):
    '''Restart a search.

    The underlying search object is replaced by a new, empty one, so any
    queries added so far are discarded.
    '''
    fresh_search = DatabaseEntryLib.CrystalStructureDatabaseTextNumericSearch()
    self._search = fresh_search
def _text_query(self, field, txt, mode='anywhere', ignore_non_alpha_num=False):
    '''Private: construct a text query.

    Nothing is added when ``txt`` is empty, unless the mode is 'is_null' or
    'not_null' (which need no text).

    :param field: the low-level identifier of the text field to search.
    :param txt: the text to search for.
    :param mode: a key of :attr:`modes`; compared case-insensitively.
    :param ignore_non_alpha_num: value for the query's
        IGNORE_NON_ALPHABETIC_CHARS option.
    '''
    lower_mode = mode.lower()
    if not txt and lower_mode not in ('is_null', 'not_null'):
        return
    query = DatabaseEntryLib.CrystalStructureDatabaseTextSearchQuery(
        field, self.modes[lower_mode], txt
    )
    query.set_option(DatabaseEntryLib.IGNORE_NON_ALPHABETIC_CHARS, ignore_non_alpha_num)
    self._search.add_query(query)


def _numeric_query(self, field, value):
    '''Private: construct a numeric query.

    :param field: the low-level identifier of the numeric field to search.
    :param value: a list or tuple whose first two items are the bounds of an
        inclusive range, or any other value for an equality test.
    '''
    if isinstance(value, (list, tuple)):
        cond = SubstructureSearchLib.InclusiveRange(value[0], value[1])
    else:
        cond = SubstructureSearchLib.EqualTo(value)
    query = DatabaseEntryLib.CrystalStructureDatabaseNumericSearchQuery(field, cond)
    self._search.add_query(query)


def _text_queries(self):
    '''Private: the tuple of formatted text queries.'''
    def format_query(q):
        # Test the same option that _text_query sets, by name rather than by
        # a bare 0.
        ignoring = q.is_option_set(DatabaseEntryLib.IGNORE_NON_ALPHABETIC_CHARS)
        return '%s %s %s %s' % (
            self._text_fields[q.field()],
            q.value(),
            self.modes.inverse_lookup(q.match_type()),
            'ignore non-alphanumeric' if ignoring else ''
        )
    return tuple(format_query(q) for q in self._search.text_queries())


def _numeric_queries(self):
    '''Private: the tuple of formatted numeric queries.'''
    def format_query(q):
        field = q.field()
        # Fields added through the private _add_* methods have no entry in
        # _numeric_fields; give them a generic label instead of raising
        # KeyError while merely describing the search.
        label = self._numeric_fields.get(field, 'Field %s' % (field,))
        return '%s %s %s' % (label, q.condition().name(), q.condition())
    return tuple(format_query(q) for q in self._search.numeric_queries())


@property
def queries(self):
    '''The current set of queries for this search.

    The text queries come first, followed by the numeric queries.

    >>> tns = TextNumericSearch()
    >>> tns.add_all_text('ibuprofen')
    >>> tns.add_author('Haisa')
    >>> print('; '.join(str(q).strip() for q in tns.queries))
    All text ibuprofen anywhere; Author Haisa anywhere
    '''
    return self._text_queries() + self._numeric_queries()
def add_all_text(self, txt, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for text anywhere in the entry.

    :param txt: the text to search for.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.ALL_TEXT
    self._text_query(field, txt, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_analogue(self, analogue, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for an analogue.

    :param analogue: the text to search for in the analogues field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.ANALOGUES
    self._text_query(field, analogue, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_author(self, author, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for an author.

    :param author: the text to search for in the author name field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.AUTHOR_NAME
    self._text_query(field, author, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_bioactivity(self, activity, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a particular bio-activity.

    :param activity: the text to search for in the bioactivity field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.BIOACTIVITY
    self._text_query(field, activity, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_color(self, color, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a particular colour.

    :param color: the text to search for in the colour field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.COLOR
    self._text_query(field, color, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_compound_name(self, compound_name, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a compound name.

    The search checks the content both of :attr:`ccdc.entry.Entry.chemical_name`
    and :attr:`ccdc.entry.Entry.synonyms`.

    To illustrate this let us have a look at the CSD entry ``ABABEM``.

    >>> from ccdc.io import EntryReader
    >>> entry_reader = EntryReader('CSD')
    >>> ababem = entry_reader.entry('ABABEM')
    >>> print(ababem.chemical_name)
    Tetrahydro[1,3,4]thiadiazolo[3,4-a]pyridazine-1,3-dione
    >>> print(ababem.synonyms[0])
    8-Thia-1,6-diazabicyclo[4.3.0]nonane-7,9-dione

    The text ``azabicyclo[4.3.0]nonane`` is only found in the synonym. Let
    us search for it using a compound name search.

    >>> from ccdc.search import TextNumericSearch
    >>> query = TextNumericSearch()
    >>> query.add_compound_name('azabicyclo[4.3.0]nonane')
    >>> hits = query.search()

    Finally let us assert that we have found ``ABABEM``.

    >>> assert(u'ABABEM' in [h.identifier for h in hits])

    :param compound_name: the text to search for.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.COMPOUND_NAME
    self._text_query(field, compound_name, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_disorder(self, disorder, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a disorder comment.

    :param disorder: the text to search for in the disorder field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.DISORDER
    self._text_query(field, disorder, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_doi(self, doi, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a DOI.

    :param doi: the text to search for in the DOI field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.DOI
    self._text_query(field, doi, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_habit(self, habit, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a particular habit.

    :param habit: the text to search for in the habit field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.HABIT
    self._text_query(field, habit, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_peptide_sequence(self, peptide_sequence, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a peptide sequence.

    :param peptide_sequence: the text to search for in the peptide sequence field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.PEPTIDE_SEQUENCE
    self._text_query(field, peptide_sequence, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_phase_transition(self, phase_transition, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a phase transition.

    :param phase_transition: the text to search for in the phase transitions field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.PHASE_TRANSITIONS
    self._text_query(field, phase_transition, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_polymorph(self, polymorph, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for polymorph information.

    :param polymorph: the text to search for in the polymorph field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.POLYMORPH
    self._text_query(field, polymorph, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_solvent(self, solvent, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a solvent.

    :param solvent: the text to search for in the recrystallisation solvent field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.RECRYSTALLISATION_SOLVENT
    self._text_query(field, solvent, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_identifier(self, refcode, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a refcode.

    Only the main refcode field is searched; see :meth:`add_all_identifiers`
    for the query on the all-refcodes field.

    :param refcode: the text to search for.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.MAIN_REFCODE_ONLY
    self._text_query(field, refcode, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_all_identifiers(self, refcode, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for an identifier, including previous identifiers.

    >>> from ccdc.search import TextNumericSearch
    >>> query = TextNumericSearch()
    >>> query.add_all_identifiers('DABHUJ')
    >>> hits = query.search()
    >>> print(hits[0].identifier)
    ACPRET03
    >>> print(hits[0].entry.previous_identifier)
    DABHUJ

    :param refcode: the text to search for.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.REFCODE
    self._text_query(field, refcode, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_source(self, source, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a source.

    >>> from ccdc.search import TextNumericSearch
    >>> searcher = TextNumericSearch()
    >>> searcher.add_source('toad')
    >>> hits = searcher.search(max_hit_structures=5)
    >>> for h in hits:
    ...     print('%-8s: %s' % (h.identifier, h.entry.source))
    ...
    CUXYAV  : Ch'an Su (dried venom of Chinese toad)
    EWAWUW  : isolated from the eggs of toad Bufo bufo gargarizans
    EWAXAD  : isolated from the eggs of toad Bufo bufo gargarizans
    FIFDUT  : dried venom of Chinese toad Ch'an Su
    FIFFAB  : dried venom of Chinese toad Ch'an Su

    :param source: the text to search for in the source field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.SOURCE
    self._text_query(field, source, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_spacegroup_symbol(self, spacegroup_symbol, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a spacegroup symbol or any alias of that symbol.

    :param spacegroup_symbol: the text to search for in the spacegroup name field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.SPACEGROUP_NAME
    self._text_query(field, spacegroup_symbol, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_synonym(self, synonym, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for a synonym.

    :param synonym: the text to search for in the synonyms field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    field = DatabaseEntryLib.SYNONYMS
    self._text_query(field, synonym, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_citation(self, author='', journal='', volume=None, year=None, first_page=None,
                 ignore_non_alpha_num=False, _coden=None):
    '''Search for a citation.

    Each argument that is supplied adds one query; arguments left at their
    defaults add nothing.

    Note: the journal parameter requires the CSD to be present in order to translate
    the journal name to a coden identifier. If the CSD is not present, but an alternative
    database is, use the alternative database's journals dict to look up a coden identifier
    and specify the _coden parameter in this function.

    :param author: author text, added with :meth:`add_author` and its default options.
    :param journal: journal name, looked up in ``self.journals``; ignored when ``_coden`` is given.
    :param volume: journal volume, matched as an exact word after conversion with ``str``.
    :param year: a single year, or a list or tuple giving an inclusive range.
    :param first_page: first page, matched as an exact word after conversion with ``str``.
    :param ignore_non_alpha_num: applied to the volume and first page queries.
    :param _coden: private; a journal coden identifier that takes precedence over ``journal``.
    :raises: NameError if ``journal`` is given but not found.
    '''
    # NOTE(review): ignore_non_alpha_num is not passed on to the author
    # query - confirm whether that is intentional.
    if author:
        self.add_author(author)

    journal_coden = _coden
    if journal_coden is None and journal:
        journal_coden = self.journals.get(journal, None)
        if journal_coden is None:
            raise NameError('The journal %s could not be found' % journal)
    if journal_coden is not None:
        self._numeric_query(DatabaseEntryLib.CCDC_JOURNAL_CODEN, journal_coden)

    if volume is not None:
        self._text_query(
            DatabaseEntryLib.JOURNAL_VOLUME, str(volume), 'exact',
            ignore_non_alpha_num=ignore_non_alpha_num
        )
    if year is not None:
        self._numeric_query(DatabaseEntryLib.JOURNAL_YEAR, year)
    if first_page is not None:
        self._text_query(
            DatabaseEntryLib.JOURNAL_PAGE, str(first_page), 'exact',
            ignore_non_alpha_num=ignore_non_alpha_num
        )
def add_ccdc_number(self, value):
    '''Search for a particular or a range of CCDC deposition numbers.

    :param value: a single number, or a list or tuple giving an inclusive range.

    >>> from ccdc.search import TextNumericSearch
    >>> searcher = TextNumericSearch()
    >>> searcher.add_ccdc_number(241370)
    >>> hits = searcher.search()
    >>> len(hits)
    1
    >>> entry = hits[0].entry
    >>> print('%s %s' % (entry.identifier, entry.ccdc_number))
    ABEBUF 241370
    >>> searcher.clear()
    >>> searcher.add_ccdc_number((241368, 241372))
    >>> hits = searcher.search()
    >>> print(len(hits))
    3
    >>> for hit in hits:
    ...     print('%s %s' % (hit.identifier, hit.entry.ccdc_number))
    ...
    ABEBUF 241370
    BIBZIW 241371
    BIMGEK 241372
    '''
    field = DatabaseEntryLib.CCDC_DEPOSITION_NUMBER
    self._numeric_query(field, value)
def add_heat_capacity_notes(self, heat_capacity_notes, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for heat capacity notes.

    :param heat_capacity_notes: the text to search for in the heat capacity notes field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    # NOTE(review): the constructed object is discarded; presumably it is
    # created for a side effect (for example a validity or feature check) -
    # confirm before removing.
    SolubilityPlatformLib.SolventData(heat_capacity_notes, 0)
    field = DatabaseEntryLib.HEAT_CAPACITY_NOTES
    self._text_query(field, heat_capacity_notes, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_heat_of_fusion_notes(self, heat_of_fusion_notes, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for heat of fusion notes.

    :param heat_of_fusion_notes: the text to search for in the heat of fusion notes field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    # NOTE(review): the constructed object is discarded; presumably it is
    # created for a side effect (for example a validity or feature check) -
    # confirm before removing.
    SolubilityPlatformLib.SolventData(heat_of_fusion_notes, 0)
    field = DatabaseEntryLib.HEAT_OF_FUSION_NOTES
    self._text_query(field, heat_of_fusion_notes, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def add_solubility_notes(self, solubility_notes, mode='anywhere', ignore_non_alpha_num=False):
    '''Search for solubility notes.

    :param solubility_notes: the text to search for in the solubility notes field.
    :param mode: name of the matching mode, a key of :attr:`modes`.
    :param ignore_non_alpha_num: forwarded to the text query unchanged.
    '''
    # NOTE(review): the constructed object is discarded; presumably it is
    # created for a side effect (for example a validity or feature check) -
    # confirm before removing.
    SolubilityPlatformLib.SolventData(solubility_notes, 0)
    field = DatabaseEntryLib.SOLUBILITY_NOTES
    self._text_query(field, solubility_notes, mode=mode, ignore_non_alpha_num=ignore_non_alpha_num)
def _add_fiz_depostion_number(self, value):
    '''Private: add a numeric query on the FIZ deposition number.'''
    # The misspelt method name ("depostion") is kept: callers may depend on it.
    field = DatabaseEntryLib.FIZ_DEPOSITION_NUMBER
    self._numeric_query(field, value)

def _add_csd_accession_date(self, value):
    '''Private: add a numeric query on the CSD accession date.'''
    field = DatabaseEntryLib.CSD_ACCESSION_DATE
    self._numeric_query(field, value)

def _add_csd_modification_date(self, value):
    '''Private: add a numeric query on the CSD modification date.'''
    field = DatabaseEntryLib.CSD_MODIFICATION_DATE
    self._numeric_query(field, value)

def _add_entry_insertion_time(self, value):
    '''Private: add a numeric query on the entry insertion time.'''
    field = DatabaseEntryLib.ENTRY_INSERTION_TIME
    self._numeric_query(field, value)
def add_predicted_semiconductor_dynamic_disorder(self, value):
    '''Search for predicted semiconductor dynamic disorder.

    See :attr:`ccdc.entry.SemiconductorPredictedProperties.dynamic_disorder`

    :param value: the numeric criterion, passed unchanged to the numeric query.
    '''
    field = DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_DYNAMIC_DISORDER
    self._numeric_query(field, value)
def add_predicted_semiconductor_singlet_state_1_energy(self, value):
    '''Search for predicted semiconductor singlet state 1 energy.

    See :attr:`ccdc.entry.SemiconductorPredictedProperties.singlet_state_1_energy`

    :param value: the numeric criterion, passed unchanged to the numeric query.
    '''
    field = DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_SINGLET_STATE_1_ENERGY
    self._numeric_query(field, value)
def add_predicted_semiconductor_singlet_state_2_energy(self, value):
    '''Search for predicted semiconductor singlet state 2 energy.

    See :attr:`ccdc.entry.SemiconductorPredictedProperties.singlet_state_2_energy`

    :param value: the numeric criterion, passed unchanged to the numeric query.
    '''
    field = DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_SINGLET_STATE_2_ENERGY
    self._numeric_query(field, value)
def add_predicted_semiconductor_triplet_state_1_energy(self, value):
    '''Search for predicted semiconductor triplet state 1 energy.

    See :attr:`ccdc.entry.SemiconductorPredictedProperties.triplet_state_1_energy`

    :param value: the numeric criterion, passed unchanged to the numeric query.
    '''
    field = DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_TRIPLET_STATE_1_ENERGY
    self._numeric_query(field, value)
def add_predicted_semiconductor_triplet_state_2_energy(self, value):
    '''Search for predicted semiconductor triplet state 2 energy.

    See :attr:`ccdc.entry.SemiconductorPredictedProperties.triplet_state_2_energy`

    :param value: the numeric criterion, passed unchanged to the numeric query.
    '''
    field = DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_TRIPLET_STATE_2_ENERGY
    self._numeric_query(field, value)
def add_predicted_semiconductor_hole_reorganization_energy(self, value):
    '''Search for predicted semiconductor hole reorganization energy.

    See :attr:`ccdc.entry.SemiconductorPredictedProperties.hole_reorganization_energy`

    :param value: the numeric criterion, passed unchanged to the numeric query.
    '''
    # The field constant has no "HOLE" in its name, unlike this method.
    field = DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_REORGANIZATION_ENERGY
    self._numeric_query(field, value)
def add_predicted_semiconductor_transfer_integral(self, value):
    '''Search for predicted semiconductor transfer integral.

    See :attr:`ccdc.entry.SemiconductorPredictedProperties.transfer_integral`

    :param value: the numeric criterion, passed unchanged to the numeric query.
    '''
    field = DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_TRANSFER_INTEGRAL
    self._numeric_query(field, value)
def add_predicted_semiconductor_homo_lumo_gap(self, value):
    '''Search for predicted semiconductor HOMO-LUMO gap.

    See :attr:`ccdc.entry.SemiconductorPredictedProperties.homo_lumo_gap`

    :param value: the numeric criterion, passed unchanged to the numeric query.
    '''
    field = DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_HOMO_LUMO_GAP
    self._numeric_query(field, value)
def add_predicted_semiconductor_singlet_state_1_oscillator_strength(self, value):
    '''Search for predicted semiconductor singlet state 1 oscillator strength.

    See :attr:`ccdc.entry.SemiconductorPredictedProperties.singlet_state_1_oscillator_strength`

    :param value: the numeric criterion, passed unchanged to the numeric query.
    '''
    field = DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_SINGLET_STATE_1_OSCILLATOR_STRENGTH
    self._numeric_query(field, value)
def add_predicted_semiconductor_singlet_state_2_oscillator_strength(self, value):
    '''Search for predicted semiconductor singlet state 2 oscillator strength.

    See :attr:`ccdc.entry.SemiconductorPredictedProperties.singlet_state_2_oscillator_strength`

    :param value: the numeric criterion, passed unchanged to the numeric query.
    '''
    field = DatabaseEntryLib.PREDICTED_SEMICONDUCTOR_SINGLET_STATE_2_OSCILLATOR_STRENGTH
    self._numeric_query(field, value)
def is_journal_valid(self, journal):
    '''Check the validity of a specified journal name in the CSD.

    This requires the CSD to be present.

    :param journal: str, journal name
    :returns: True if :attr:`journals` maps the name to a value other than None.
    '''
    coden = self.journals.get(journal)
    return coden is not None
@property
def journals(self):
    '''A dictionary of journal name : ccdc code number for journals in the CSD.

    This requires the CSD to be present. The dictionary is built on first
    access and cached on the instance.
    '''
    if self._journal_list is None:
        csd_location = _CSDDatabaseLocator.get_binary_csd_location()
        _binary_db = CSDSQLDatabaseLib.CSDSQLDatabase(csd_location)
        journal_infos = _binary_db.journal_list_info().journal_list()
        self._journal_list = {info.name(): info.ccdc_coden() for info in journal_infos}
    return self._journal_list

def _search_reader(self, reader):
    '''Private: run the query against a reader and return a list of hits.

    :param reader: a database reader; a text numeric searcher is created from it
        and cached on it if not already present.
    :returns: list of :class:`TextNumericSearch.TextNumericHit`.
    :raises: NotImplementedError if no text numeric searcher can be obtained.
    '''
    self._search.settings().hits_limit_manager().reset_hits()

    # Lazily attach a searcher to the reader; failure here is reported below.
    if not hasattr(reader, '_text_numeric_searcher'):
        try:
            reader._text_numeric_searcher = reader._db.searcher_factory().text_numeric_searcher()
        except (RuntimeError, NameError, AttributeError):
            pass
    if not hasattr(reader, '_text_numeric_searcher'):
        raise NotImplementedError('This database does not support TextNumericSearch')

    if not self.settings._has_filter_set():
        # No filters: let the underlying search apply the hit limit itself.
        self._search.settings().set_maximum_hits_limit(self.settings.max_hit_structures)
        ids = CSDSQLDatabaseLib.text_numeric_search(reader._text_numeric_searcher, self._search)
        return [TextNumericSearch.TextNumericHit(x, reader._db) for x in ids]

    # Filters are applied after the search, so the underlying search runs with
    # the limit raised to maxint32 and the limit is enforced while filtering.
    max_hits = self.settings.max_hit_structures
    self._search.settings().set_maximum_hits_limit(maxint32)
    ids = CSDSQLDatabaseLib.text_numeric_search(reader._text_numeric_searcher, self._search)
    self.settings.max_hit_structures = max_hits
    hits = []
    for x in ids:
        hit = TextNumericSearch.TextNumericHit(x, reader._db)
        if self.settings.test(hit.entry):
            hits.append(hit)
            if max_hits and len(hits) >= max_hits:
                break
    return hits

def _search_entry(self, entry):
    '''Private: not supported for a text numeric search.'''
    raise NotImplementedError('TextNumericSearch of an entry')

def _search_crystal(self, crystal):
    '''Private: not supported for a text numeric search.'''
    raise NotImplementedError('TextNumericSearch of a crystal')

def _search_molecule(self, molecule):
    '''Private: not supported for a text numeric search.'''
    raise NotImplementedError('TextNumericSearch of a molecule')
@staticmethod
def from_xml(xml):
    '''Create a TextNumericSearch from XML.

    :param xml: XML string
    :returns: a new :class:`ccdc.search.TextNumericSearch` whose query is the parsed XML.
    '''
    xml_stream = UtilitiesLib.istringstream(xml)
    xml_parser = DatabaseEntryLib.TextNumericSearchXMLParser()
    searcher = TextNumericSearch()
    searcher._search = xml_parser.parse(xml_stream)
    return searcher
@staticmethod
def from_xml_file(file_name):
    '''Create a TextNumericSearch from an XML file.

    :param file_name: path to XML file
    :returns: the search built by :meth:`from_xml` from the file's contents.
    :raises: IOError when the file does not exist
    '''
    if os.path.exists(file_name):
        with open(file_name) as xml_file:
            xml_text = xml_file.read()
        return TextNumericSearch.from_xml(xml_text)
    raise IOError('The file %s does not exist' % file_name)
def read_xml(self, xml):
    '''Read a query from XML, replacing this searcher's current query.

    :param xml: XML string
    '''
    xml_stream = UtilitiesLib.istringstream(xml)
    xml_parser = DatabaseEntryLib.TextNumericSearchXMLParser()
    self._search = xml_parser.parse(xml_stream)
def read_xml_file(self, file_name):
    '''Read a text numeric search from an XML file.

    The file's contents are handed to :meth:`read_xml`.

    :param file_name: path to XML file
    :raises: IOError if the file does not exist
    '''
    if os.path.exists(file_name):
        with open(file_name) as xml_file:
            self.read_xml(xml_file.read())
        return
    raise IOError('The file %s does not exist' % file_name)
###########################################################################
[docs]class SubstructureSearch(Search): '''Query crystal structures for interactions.''' _telemetry = 0
class Settings(Search.Settings):
    '''Settings appropriate to a substructure search.'''

    # Two-way map between the public string names and the underlying
    # enantiomer constraint values.
    _enantiomer_match_type_dict = utilities.bidirectional_dict(
        NEVER=MotifSearchLib.EnantiomerSensitiveConstraint.NEVER,
        SPACEGROUP_DEPENDENT=MotifSearchLib.EnantiomerSensitiveConstraint.SPACEGROUP_DEPENDENT,
        ALWAYS=MotifSearchLib.EnantiomerSensitiveConstraint.ALWAYS,
    )

    def __init__(self, max_hit_structures=None, max_hits_per_structure=None):
        '''Initialise the settings.

        :param max_hit_structures: maximum hits limit; left at the underlying default when None.
        :param max_hits_per_structure: maximum hits per structure; None is passed on as 0.
        '''
        motif_settings = CSDSQLDatabaseLib.CrystalStructureDatabaseMotifSearchSettings()
        motif_settings.set_match_mode(
            CSDSQLDatabaseLib.CrystalStructureDatabaseMotifSearchSettings.MATCH_3D_CRYSTAL_ONLY
        )
        if max_hit_structures is not None:
            motif_settings.set_maximum_hits_limit(max_hit_structures)
        per_structure = 0 if max_hits_per_structure is None else max_hits_per_structure
        motif_settings.set_maximum_hits_per_structure(per_structure)
        Search.Settings.__init__(self, _settings=motif_settings)
        self._match_enantiomers = MotifSearchLib.EnantiomerSensitiveConstraint.NEVER

    @property
    def max_hits_per_structure(self):
        '''Maximum number of hits per structure.'''
        return self._settings.maximum_hits_per_structure()

    @max_hits_per_structure.setter
    def max_hits_per_structure(self, value):
        self._settings.set_maximum_hits_per_structure(value)

    @property
    def match_enantiomers(self):
        '''Enantiomer matching behavior

        The value will be one of 'NEVER' meaning enantiomers are never checked,
        'SPACEGROUP_DEPENDENT' meaning enantiomers are checked if the crystal's
        spacegroup implies the presence of enantiomers, or 'ALWAYS' meaning
        enantiomers are always checked.
        '''
        names = SubstructureSearch.Settings._enantiomer_match_type_dict
        return names.inverse_lookup(self._match_enantiomers)

    @match_enantiomers.setter
    def match_enantiomers(self, value):
        names = SubstructureSearch.Settings._enantiomer_match_type_dict
        self._match_enantiomers = names[value]
class HitProcessor(object):
    '''Override this class to provide your own add_hit() method.

    This class allows a search to process hits as they are found by the search
    class, rather than waiting until all hits are found before allowing access
    to them, a procedure which may well run out of memory for very general
    searches.
    '''

    def search(self, searcher, database=None):
        '''Searches the database with the substructure search.

        :param searcher: a :class:`ccdc.search.SubstructureSearch` instance.
        :param database: a :class:`ccdc.io.EntryReader` instance. If not specified the CSD will be searched.
        :raises: NotImplementedError if the database provides no motif searcher.

        For each hit found, :meth:`ccdc.search.SubstructureSearch.HitProcessor.add_hit`
        will be called with a :class:`ccdc.search.SubstructureSearch.SubstructureHit` instance.
        '''
        self._cancelled = False
        self.searcher = searcher
        self.searcher._add_enantiomer_consistency()
        self.database = EntryReader('csd') if database is None else database

        # Lazily attach a motif searcher to the database; failure is reported below.
        if not hasattr(self.database, '_motif_searcher'):
            try:
                self.database._motif_searcher = self.database._db.searcher_factory().motif_searcher()
            except (RuntimeError, AttributeError):
                pass
        if not hasattr(self.database, '_motif_searcher'):
            # there used to be fallback code here, but now we expect to always support motif search on any database
            raise NotImplementedError("Substructure search is not implemented on this database type")

        motif_searcher = self.database._motif_searcher
        motif_searcher.progress_monitor().reset()
        # The writer is given this processor; results arrive through __call__.
        results_writer = CSDSQLDatabaseLib.PythonResultsWriter(self)
        motif_searcher.search(
            self.searcher._motif,
            self.searcher.settings._settings,
            results_writer
        )

    def __call__(self, **kw):
        '''Private: this method will be called from the search.'''
        if 'max_hits_reached' in kw:
            self.cancel()
            return
        if 'match' in kw:
            hit = SubstructureSearch.SubstructureHit._from_match(
                kw['match'], self.searcher, _binary_database=self.database._db
            )
            self.add_hit(hit)
            return
        if 'hit' in kw:
            self.add_hit(kw['hit'])
            return
        raise RuntimeError('Unknown keyword in __call__', kw)

    def cancel(self):
        '''Cancels the search.'''
        # The monitor may not exist (no database or searcher yet); the
        # processor is flagged as cancelled either way.
        try:
            self.database._motif_searcher.progress_monitor().cancel()
        except AttributeError:
            pass
        self._cancelled = True

    def add_hit(self, hit):
        '''Override this to provide your own hit processing.'''
        raise NotImplementedError('add_hit() must be implemented.')
class _MotifMatchHit(Search.SearchHit):
    '''A hit with motif match results.

    Wraps the underlying motif match object (stored as ``_motif_match``) and
    provides access to the matched components, atoms, substructures and
    symmetry operators.
    '''

    def __init__(self, identifier, match=None, _database=None, _entry=None, _crystal=None, _molecule=None, _binary_database=None):
        '''Initialise the hit.

        :param identifier: the identifier of the structure that was hit.
        :param match: the underlying motif match object, if any.

        When a database or binary database is supplied, any entry, crystal or
        molecule passed in is discarded before calling the base class.
        '''
        if _database is not None or _binary_database is not None:
            _entry = _crystal = _molecule = None
        super(SubstructureSearch._MotifMatchHit, self).__init__(
            identifier, _database=_database, _entry=_entry, _crystal=_crystal,
            _molecule=_molecule, _binary_database=_binary_database)
        self._motif_match = match

    def match_components(self):
        ''' Return the molecular components containing the atoms matched by the search.

        One molecule is created per substructure match and the results are
        passed through a set before being returned as a list.

        :returns: list of :class:`ccdc.molecule.Molecule`
        '''
        csv = ChemistryLib.CrystalStructureView.instantiate(self.crystal._crystal)
        ss = MotifSearchLib.MotifSearchStructure(csv)
        # NOTE(review): whether the set removes duplicates depends on how
        # molecule.Molecule defines equality/hashing -- not visible here.
        match_mols = set([
            molecule.Molecule('%02d' % i, _molecule=ss.molecule(self._motif_match.substructure_match(i)).create_editable_molecule())
            for i in range(self._motif_match.nsubstructure_matches())
        ])
        return list(match_mols)

    def match_atoms(self, indices=False):
        ''' Return the atoms matched by the substructure.

        :param indices: Whether to return atom indices instead of :class:`ccdc.molecule.Atom` instances
        :returns: list of :class:`ccdc.molecule.Atom` instances or a tuple of atom indices
        :raises: RuntimeError if a matched atom cannot be located in the hit's molecule.

        The result is computed on the first call and cached on the hit.

        The atoms returned will all be in the asymmetric unit, so directly measuring constraints and measurements
        from these atoms will not give the correct results if a symmetry-generated copy was involved in the match.
        See :meth:`ccdc.search.SubstructureSearch.SubstructureHit.match_symmetry_operators` for a way to determine if this is the case.
        '''
        # NOTE(review): the nesting below (in particular the for/else that
        # raises) was reconstructed from a whitespace-collapsed listing --
        # confirm against the original file.
        if not hasattr(self, '_real_indices'):
            csv = ChemistryLib.CrystalStructureView.instantiate(self.crystal._crystal)
            ss = MotifSearchLib.MotifSearchStructure(csv)
            match_atoms = []
            mol = self.molecule

            # a is an atom of the hit's molecule, b the base atom of a matched atom.
            def _matches(a, b, depth=0):
                if a.coordinates is None:
                    if b.site() is None:
                        # match labels here and first neighbours
                        if depth >= 2:
                            return True
                        return a.label == b.label() and (len(a.neighbours) == 0 or len(b.get_neighbours()) == 0 or _matches(a.neighbours[0], b.get_neighbours()[0], depth+1))
                    else:
                        return False
                else:
                    if b.site() is None:
                        return False
                    else:
                        # Same label and same orthogonal coordinates to 3 decimal places.
                        return (a.label == b.label() and
                                round(a.coordinates.x, 3) == round(b.site().orth().x(), 3) and
                                round(a.coordinates.y, 3) == round(b.site().orth().y(), 3) and
                                round(a.coordinates.z, 3) == round(b.site().orth().z(), 3)
                                )

            for j in range(self._motif_match.nsubstructure_matches()):
                # Atoms already claimed within this substructure are not reused.
                sub_matches = []
                for i in range(len(self._motif_match.substructure_match(j).atom_match())):
                    _atom = ss.atom(self._motif_match, j, i)
                    _base = csv.base_atom(_atom)
                    # try same index first
                    added = False
                    if _atom.index() < len(mol.atoms):
                        a = mol.atoms[_atom.index()]
                        if not a in sub_matches and _matches(a, _base):
                            sub_matches.append(a)
                            added = True
                    if not added:
                        # Fall back to scanning every atom of the molecule.
                        for a in mol.atoms:
                            if not a in sub_matches and _matches(a, _base):
                                sub_matches.append(a)
                                break
                        else:
                            raise RuntimeError('No matching atom??? %s %s - %s %s in %s' % (_atom.label(), str(_atom.site().orth()), _base.label(), str(_base.site().orth()), self.identifier))
                match_atoms += sub_matches
            self._match_atoms = match_atoms
            self._real_indices = tuple(a.index for a in self._match_atoms)
        if indices:
            return tuple(self._real_indices)
        else:
            return self._match_atoms

    def match_substructures(self):
        '''Returns each substructure of the hit as a molecule with the bonds and atoms of the hit.

        The symmetry operations of the hit will be applied to the molecules, so measurement and constraints
        will be appropriate to the hit.

        :returns: tuple of :class:`ccdc.molecule.Molecule`, one for each substructure of the hit with the bonds and atoms of the hit
        '''
        csv = ChemistryLib.CrystalStructureView.instantiate(self.crystal._crystal)
        mss = MotifSearchLib.MotifSearchStructure(csv)
        at_matches = [self._motif_match.substructure_match(i).atom_match() for i in range(self._motif_match.nsubstructure_matches())]
        at_addrs = [[self._motif_match.atom_address(i, j) for j in range(len(at_matches[i]))] for i in range(len(at_matches))]
        ats = [[mss.atom(aa) for aa in l] for l in at_addrs]
        api_ats = [[molecule.Atom(_atom=a) for a in l] for l in ats]
        # Each molecule is an editable copy of the molecule holding the
        # substructure's first matched atom.
        api_mols = [molecule.Molecule(self.identifier, _molecule=l[0].molecule().create_editable_molecule()) for l in ats]

        # Atoms are paired by symbol, label and the string form of their coordinates.
        def _matching_ats(a, b):
            return (
                a.atomic_symbol == b.atomic_symbol and
                a.label == b.label and
                str(a.coordinates) == str(b.coordinates)
            )

        for i, m in enumerate(api_mols):
            m._molecule.reorder_atoms([a.index() for a in ats[i]])
            # Keep only the atoms that correspond to a matched atom.
            m.remove_atoms(a for a in m.atoms if not any(_matching_ats(a, b) for b in api_ats[i]))
        return tuple(api_mols)

    def match_symmetry_operators(self):
        '''The symmetry operators required to form the match.

        An empty string is given for an atom where no symmetry relation is found.

        :returns: a list of symmetry operators in the order of the matched atoms.
        '''
        crystal = self.crystal
        ats = self.match_atoms()
        csv = ChemistryLib.CrystalStructureView.instantiate(crystal._crystal)
        mss = MotifSearchLib.MotifSearchStructure(csv)
        motif_match = self._motif_match

        def _get_symmop(a):
            '''Get the appropriate symmop.'''
            # Convert the flat atom index into (substructure, index within it).
            z = a
            sub = 0
            while True:
                subm = motif_match.substructure_match(sub)
                if z >= len(subm.atom_match()):
                    z -= len(subm.atom_match())
                    sub += 1
                else:
                    break
            at = mss.atom(motif_match, sub, z)
            base = csv.base_asymmetric_unit_atom(at)
            op = ChemistryLib.atom_atom_symmetry_relation(crystal._crystal, base, at)
            if op:
                symmop = op.to_string()
            else:
                symmop = ''
            return symmop

        symmops = [_get_symmop(i) for i in range(len(ats))]
        return symmops
class SubstructureHit(_MotifMatchHit):
    '''A hit from a substructure search.

    In addition to the match information of the base class, a hit records the
    values of the query's measurements and constraints (when a match is
    supplied) and gives access to the geometric objects, atoms and object
    names from which they were defined.
    '''

    def __init__(self, identifier, match=None, search_structure=None, query=None,
                 _database=None, _entry=None, _crystal=None, _molecule=None, _binary_database=None):
        '''Initialise the hit.

        :param identifier: the identifier of the structure that was hit.
        :param match: the underlying motif match object, if any.
        :param search_structure: accepted but not used by this constructor.
        :param query: the :class:`ccdc.search.SubstructureSearch` that produced the hit.
        '''
        super(SubstructureSearch.SubstructureHit, self).__init__(
            identifier, match, _database=_database, _entry=_entry, _crystal=_crystal,
            _molecule=_molecule, _binary_database=_binary_database)
        self._disorder_dealt_with = False
        self._search = query
        if match is not None:
            #self._make_geometric_objects()
            self._measure_measurements()
        self.query = query
        # Geometric objects are built lazily by the geometric_objects property.
        self._geometric_objects = None

    @staticmethod
    def _from_match(m, search, _binary_database=None, _database=None, _entry=None, _crystal=None, _molecule=None):
        '''Private: construct a SubstructureHit from a match object.'''
        h = SubstructureSearch.SubstructureHit(
            m.identifier().str(), m.data().motif_match(), query=search,
            _binary_database=_binary_database, _database=_database,
            _entry=_entry, _crystal=_crystal, _molecule=_molecule
        )
        return h

    def _make_geometric_object(self, obj, search_structure):
        '''PRIVATE: make a geometric object.

        :param obj: a query geometric object (point, plane, vector or atom group).
        :param search_structure: the motif search structure built for this hit's crystal.
        :returns: coordinates for a point, a plane or vector descriptor, or a tuple of atoms for a group.
        :raises: NotImplementedError for any other kind of object.
        '''
        if isinstance(obj, (SubstructureSearchLib.ConstraintAtomPoint,
                            SubstructureSearchLib.ConstraintCentroidPoint,
                            SubstructureSearchLib.ConstraintDummyPoint,
                            SubstructureSearchLib.ConstraintPoint)):
            p0 = self._motif_match.get_point(MotifSearchLib.Object_as_Point(obj), search_structure)
            return molecule.Coordinates(p0.x(), p0.y(), p0.z())
        elif isinstance(obj, SubstructureSearchLib.ConstraintPlane):
            p = GeometricDescriptors.Plane(
                None, None, _plane=self._motif_match.get_plane(obj, search_structure)
            )
            return p
        elif isinstance(obj, SubstructureSearchLib.ConstraintVector):
            vec = self._motif_match.get_vector(obj, search_structure)
            p = GeometricDescriptors.Vector(vec.x(), vec.y(), vec.z())
            return p
        elif isinstance(obj, SubstructureSearchLib.ConstraintAtomGroup):
            _csv = ChemistryLib.CrystalStructureView.instantiate(self.crystal._crystal)
            _mss = MotifSearchLib.MotifSearchStructure(_csv)
            mgsm = MotifSearchLib.MotifGeometricSearchMatch(self._motif_match, _mss)
            atoms = obj.atoms(mgsm)
            return tuple(molecule.Atom(_atom=a) for a in atoms)
        raise NotImplementedError('Have not implemented geometric object %s' % obj)

    def _make_geometric_objects(self):
        '''PRIVATE: make all the geometric objects.'''
        if not self._search.geometric_objects:
            self._geometric_objects = {}
            return
        _csv = ChemistryLib.CrystalStructureView.instantiate(self.crystal._crystal)
        _mss = MotifSearchLib.MotifSearchStructure(_csv)
        self._geometric_objects = {
            name: self._make_geometric_object(obj, _mss)
            for name, obj in self._search.geometric_objects.items()
        }

    @property
    def geometric_objects(self):
        '''A dictionary of the query's geometric objects realised for this hit, keyed by name.

        The dictionary is built on first access.
        '''
        if self._geometric_objects is None:
            self._make_geometric_objects()
        return self._geometric_objects

    def _measure_measurements(self):
        '''PRIVATE: make all the measurements.

        Every parameter of the match whose name is one of the query's
        measurements is stored in ``measurements``; all others are stored in
        ``constraints``.
        '''
        self.measurements = dict()
        self.constraints = dict()
        for i in range(self._motif_match.nparameters()):
            mp = self._motif_match.parameter_value(i)
            if mp.parameter().name() in self._search.measurements:
                self.measurements[mp.parameter().name()] = mp.value()
            else:
                self.constraints[mp.parameter().name()] = mp.value()

    def measurement_atoms(self, name):
        '''The atoms involved in a measurement.

        :param name: the name of the measurement.
        :returns: a tuple of :class:`ccdc.molecule.Atom` instances.

        The atoms will be returned in an arbitrary order. All atoms involved in the measurement
        will be present, so for example a centroid-centroid distance measurement will produce
        the atoms of both centroids.
        '''
        con = self._search.measurements[name]
        _csv = ChemistryLib.CrystalStructureView.instantiate(self.crystal._crystal)
        _mss = MotifSearchLib.MotifSearchStructure(_csv)
        mgsm = MotifSearchLib.MotifGeometricSearchMatch(self._motif_match, _mss)
        res = con.test(mgsm)
        ats = res.get_atoms()
        return tuple(molecule.Atom(_atom=a) for a in ats)

    def constraint_atoms(self, name):
        '''The atoms from which the constraint was defined.

        :param name: the name of the constraint (or contact).
        :returns: a tuple of :class:`ccdc.molecule.Atom` instances.
        :raises: KeyError if the name is neither a constraint nor a contact of the query.

        The atoms will be returned in an arbitrary order. All atoms involved in defining the constraint
        will be returned.
        '''
        # Only consult the contacts when the name is not a constraint. Passing
        # contacts[name] as the default of dict.get() would evaluate it eagerly
        # and raise KeyError for every constraint that is not also a contact.
        constraints = self._search.constraints
        if name in constraints:
            con = constraints[name]
        else:
            con = self._search.contacts[name]
        _csv = ChemistryLib.CrystalStructureView.instantiate(self.crystal._crystal)
        _mss = MotifSearchLib.MotifSearchStructure(_csv)
        mgsm = MotifSearchLib.MotifGeometricSearchMatch(self._motif_match, _mss)
        if isinstance(con, SubstructureSearchLib.SubstructureContact):
            sub1 = con.substruct_a()
            at1 = con.atom_a()
            sub2 = con.substruct_b()
            at2 = con.atom_b()
            addr1 = self._motif_match.atom_address(sub1, at1)
            addr2 = self._motif_match.atom_address(sub2, at2)
            return molecule.Atom(_atom=_mss.atom(addr1)), molecule.Atom(_atom=_mss.atom(addr2))
        elif isinstance(con, SubstructureSearchLib.SubstructureObjectContact):
            obj1 = con.object_a()
            obj2 = con.object_b()
            return tuple(a for a in self._geometric_object_atoms(obj1.label())) + \
                tuple(a for a in self._geometric_object_atoms(obj2.label()))
        res = con.test(mgsm)
        ats = res.get_atoms()
        return tuple(molecule.Atom(_atom=a) for a in ats)

    def _geometric_object_atoms(self, name):
        '''PRIVATE: the matched atoms of a constraint object.'''
        _csv = ChemistryLib.CrystalStructureView.instantiate(self.crystal._crystal)
        _mss = MotifSearchLib.MotifSearchStructure(_csv)
        mgsm = MotifSearchLib.MotifGeometricSearchMatch(self._motif_match, _mss)
        atoms = self._search.geometric_objects[name].atoms(mgsm)
        return tuple(molecule.Atom(_atom=a) for a in atoms)

    def centroid_atoms(self, name):
        '''The atoms from which the centroid is derived.'''
        return self._geometric_object_atoms(name)

    def dummy_point_atoms(self, name):
        '''The atoms from which the dummy point was defined.'''
        return self._geometric_object_atoms(name)

    def group_atoms(self, name):
        '''The atoms from which the group was defined.'''
        return self._geometric_object_atoms(name)

    def vector_atoms(self, name):
        '''The atoms from which the vector was defined.'''
        return self._geometric_object_atoms(name)

    def plane_atoms(self, name):
        '''The atoms from which the plane was defined.'''
        return self._geometric_object_atoms(name)

    ### Object names
    # Maps a constraint's class name to the function that casts a generic
    # geometric constraint to that concrete type.
    _constraint_types = dict(
        PlaneAngleConstraint=SubstructureSearchLib.GeometricConstraint_as_PlaneAngleConstraint,
        PointAngleConstraint=SubstructureSearchLib.GeometricConstraint_as_PointAngleConstraint,
        PointDistanceConstraint=SubstructureSearchLib.GeometricConstraint_as_PointDistanceConstraint,
        PointPlaneDistanceConstraint=SubstructureSearchLib.GeometricConstraint_as_PointPlaneDistanceConstraint,
        PointTorsionConstraint=SubstructureSearchLib.GeometricConstraint_as_PointTorsionConstraint,
        VectorAngleConstraint=SubstructureSearchLib.GeometricConstraint_as_VectorAngleConstraint,
        VectorPlaneAngleConstraint=SubstructureSearchLib.GeometricConstraint_as_VectorPlaneAngleConstraint,
        Atom3DPropertyConstraint=SubstructureSearchLib.GeometricConstraint_as_Atom3DPropertyConstraint,
        ConstantValueConstraint=SubstructureSearchLib.GeometricConstraint_as_ConstantValueConstraint,
        TransformConstraint=SubstructureSearchLib.GeometricConstraint_as_TransformConstraint,
        UnaryTransformConstraint=SubstructureSearchLib.GeometricConstraint_as_UnaryTransformConstraint,
        BinaryTransformConstraint=SubstructureSearchLib.GeometricConstraint_as_BinaryTransformConstraint,
    )

    @staticmethod
    def _find_objects(constraint):
        '''PRIVATE: the geometric objects a constraint is defined on.

        Transform constraints are followed down to their sub-constraints and a
        constant value constraint contributes no objects.

        :returns: a tuple of constraint objects.
        '''
        real_con = SubstructureSearch.SubstructureHit._constraint_types[constraint.class_name()](constraint)
        if real_con.class_name() == 'UnaryTransformConstraint':
            return SubstructureSearch.SubstructureHit._find_objects(real_con.sub_constraint())
        if real_con.class_name() == 'BinaryTransformConstraint':
            return (SubstructureSearch.SubstructureHit._find_objects(real_con.sub_constraint1()) +
                    SubstructureSearch.SubstructureHit._find_objects(real_con.sub_constraint2()))
        if real_con.class_name() == 'ConstantValueConstraint':
            return ()
        objs = tuple(real_con.objects(i) for i in range(real_con.nobjects()))
        return objs

    def _object_name(self, _object):
        '''PRIVATE: the matched atom for an atom-point object, otherwise the object's label.

        A label of the form ``<substructure index>:<atom index>`` (both parts
        non-empty and all digits) is resolved to the corresponding atom of the
        matched substructures; any other label is returned unchanged.
        '''
        n = _object.label()
        if ':' in n:
            bits = n.split(':')
            # str.isdigit() is False for an empty string, so a label with an
            # empty side falls through to the plain label instead of failing
            # in int('').
            if len(bits) == 2 and all(b.isdigit() for b in bits):
                sub_inx = int(bits[0])
                at_inx = int(bits[1])
                substructs = self.match_substructures()
                # An index past the end of a substructure carries over into
                # the following one(s).
                while at_inx >= len(substructs[sub_inx].atoms):
                    at_inx -= len(substructs[sub_inx].atoms)
                    sub_inx += 1
                # Reuse the substructures already built rather than rebuilding them.
                return substructs[sub_inx].atoms[at_inx]
        return _object.label()

    def measurement_objects(self, measurement):
        '''A tuple of object names and atoms from which the measurement was taken.

        :param measurement: the string name of the measurement.
        :returns: a tuple of geometric object names or atoms.
        '''
        _constraint = self._search.measurements[measurement]
        return tuple(self._object_name(obj) for obj in self._find_objects(_constraint))

    def constraint_objects(self, constraint):
        '''A tuple of object names and atoms from which the constraint was defined.

        :param constraint: the string name of the constraint (or contact).
        :returns: a tuple of geometric object names or atoms.
        :raises: IndexError if the name is neither a constraint nor a contact of the query.
        '''
        _constraint = self._search.constraints.get(constraint, self._search.contacts.get(constraint))
        if _constraint is None:
            raise IndexError('The constraint %s could not be found' % constraint)
        return tuple(self._object_name(obj) for obj in self._find_objects(_constraint))

    def _geometric_object_objects(self, _obj):
        '''PRIVATE: the names or atoms of the objects a geometric object is defined on.'''
        return tuple(self._object_name(_obj.objects(i)) for i in range(_obj.nobjects()))

    def centroid_objects(self, name):
        '''The geometric object names and atoms from which the centroid was defined.'''
        return self._geometric_object_objects(self._search.geometric_objects[name])

    def dummy_point_objects(self, name):
        '''The geometric object names and atoms from which the dummy point was defined.'''
        return self._geometric_object_objects(self._search.geometric_objects[name])

    def group_objects(self, name):
        '''The geometric object names and atoms from which the group was defined.'''
        return self._geometric_object_objects(self._search.geometric_objects[name])

    def vector_objects(self, name):
        '''The geometric object names and atoms from which the vector was defined.'''
        return self._geometric_object_objects(self._search.geometric_objects[name])

    def plane_objects(self, name):
        '''The geometric object names and atoms from which the plane was defined.'''
        return self._geometric_object_objects(self._search.geometric_objects[name])
class SubstructureHitList(list):
    '''List of hits from a :class:`ccdc.search.SubstructureSearch`'''

    def superimpose(self):
        '''Superimpose all matched molecules on their query atoms

        Just superimpose on first substructure

        The first hit with matched atoms is the reference: a copy of its
        molecule comes first in the result, followed by the overlaid molecule
        of each later hit. Hits before the reference are not included.

        :returns: a list of molecules; empty if there are no hits.
        :raises: RuntimeError if no hit has matching atoms.
        '''
        ret = []
        if self:
            for got_one, hit in enumerate(self):
                inx0 = hit.match_atoms()
                if inx0:
                    mol0 = hit.molecule.copy()
                    break
            else:
                raise RuntimeError('No structure has matching atoms')
            # A second, independent copy of the reference goes into the result.
            ret.append(self[got_one].molecule.copy())
            for h in self[got_one+1:]:
                inx1 = h.match_atoms()
                mol1 = h.molecule.copy()
                overlay = MolecularDescriptors.Overlay(mol0, mol1, atoms=zip(inx0, inx1))
                ret.append(overlay.molecule)
        return ret

    def write_c2m_file(self, file_name):
        '''Write a ConQuest to Mercury interchange file.

        This file allows substructure search results to be read into the data analysis package of Mercury.

        :param file_name: file to which the data will be written; '.c2m' is appended if it is not already the suffix.
        :raises: RuntimeError if the list contains no hits.
        '''
        if not self:
            raise RuntimeError('No hits to write')

        def split_by_id():
            '''Split the hit list into runs of consecutive hits with the same identifier.'''
            parts = []
            identifier = None
            for h in self:
                if h.identifier != identifier:
                    identifier = h.identifier
                    parts.append([])
                parts[-1].append(h)
            return parts

        def make_atom(h, i, a, done, output):
            '''Make a tag representing an individual atom.'''
            motif_match = h._motif_match
            crystal = h.crystal
            csv = ChemistryLib.CrystalStructureView.instantiate(crystal._crystal)
            mss = MotifSearchLib.MotifSearchStructure(csv)
            # find the right substructure_match
            z = i
            sub = 0
            while True:
                subm = motif_match.substructure_match(sub)
                if z >= len(subm.atom_match()):
                    z -= len(subm.atom_match())
                    sub += 1
                else:
                    break
            # Now subm is the right match, z is the right index in the match
            fo = a._atom.annotations().obtain_FileOrdering().file_order()
            # NOTE(review): the results of translation() and
            # atom_atom_symmetry_relation() are discarded; the calls are kept
            # in case they have side effects -- confirm before removing.
            subm.translation()
            at = mss.atom(motif_match, sub, z)
            base = csv.base_asymmetric_unit_atom(at)
            ChemistryLib.atom_atom_symmetry_relation(crystal._crystal, base, at)
            # If this file order was already written, use the matched atom's own.
            if fo in done:
                fo = at.annotations().obtain_FileOrdering().file_order()
            done.add(fo)
            output.write('<atom id="%d" substructure_index="%d" aser_index="%d"/>\n' % (i, i, fo))

        def make_atoms(h, output):
            '''Make the atoms tag.'''
            output.write('<atoms>\n')
            s = set()
            for i, a in enumerate(h.match_atoms()):
                # Atoms whose file order is 0 are not written.
                if a._atom.annotations().obtain_FileOrdering().file_order() != 0:
                    make_atom(h, i, a, s, output)
            output.write('</atoms>\n')

        def make_absolute_index(motif_match, sub, a):
            '''Make the atom index absolute.'''
            atinx = a
            for i in range(sub):
                atinx += len(motif_match.substructure_match(i).atom_match())
            return atinx

        def make_measure(h, name, value, output):
            '''Make a parameter tag for a measurement.'''
            motif_match = h._motif_match
            cc = h._search.measurements[name]
            # NOTE(review): nesting reconstructed from a whitespace-collapsed
            # listing; the parameter is assumed to be written only when the
            # label matches the name, and the NotImplementedError is assumed
            # to belong to the object-count chain -- confirm against the original.
            if cc.label() == name:
                objects = [cc.objects(i) for i in range(cc.nobjects())]
                pts = [SubstructureSearchLib.Object_as_AtomPoint(o) for o in objects]
                sub_at_inxs = [(p.substructure_index(), p.atom_index()) for p in pts]
                at_inxs = [make_absolute_index(motif_match, s, a) for s, a in sub_at_inxs]
                atom_inxs = ','.join('%d' % a for a in at_inxs)
                if cc.nobjects() == 2:
                    flavour = 'distance'
                elif cc.nobjects() == 3:
                    flavour = 'angle'
                elif cc.nobjects() == 4:
                    flavour = 'torsion'
                else:
                    raise NotImplementedError('Need the other constraints ' + str(type(cc)) + ' ' + name)
                output.write('<parameter name="%s" type="%s" value="%.3f">\n' % (name, flavour, value))
                output.write('<atom_ids>%s</atom_ids>\n' % atom_inxs)
                output.write('</parameter>\n')

        def _get_atom_inxs(h, ct, name):
            '''Absolute indices of the first ct atom points of the named constraint.'''
            con = h._search.constraints[name]
            pts = [SubstructureSearchLib.Object_as_AtomPoint(con.objects(i)) for i in range(ct)]
            inxs = [make_absolute_index(h._motif_match, p.substructure_index(), p.atom_index()) for p in pts]
            return inxs

        def make_constraint(h, name, value, output):
            '''Make a parameter tag for a constraint or contact.'''
            motif_match = h._motif_match
            constraints = h._search.constraints
            if name not in constraints:
                raise AttributeError('No constraint named %s' % name)
            cc = constraints[name]
            if isinstance(cc, SubstructureSearchLib.SubstructureContact):
                atom_inxs = [
                    make_absolute_index(motif_match, cc.substruct_a(), cc.atom_a()),
                    make_absolute_index(motif_match, cc.substruct_b(), cc.atom_b())
                ]
                flavour = 'contact'
            elif cc.class_name() == 'PointAngleConstraint':
                atom_inxs = _get_atom_inxs(h, 3, name)
                flavour = 'angle'
            elif cc.class_name() == 'PointTorsionConstraint':
                atom_inxs = _get_atom_inxs(h, 4, name)
                flavour = 'torsion'
            elif cc.class_name() == 'PointDistanceConstraint':
                atom_inxs = _get_atom_inxs(h, 2, name)
                flavour = 'distance'
            else:
                raise NotImplementedError('Need the other constraints')
            output.write('<parameter name="%s" type="%s" value="%.3f">\n' % (name, flavour, value))
            output.write('<atom_ids>%s</atom_ids>\n' % (','.join('%d' % a for a in atom_inxs)))
            output.write('</parameter>\n')

        def make_params(h, output):
            '''Make the parameters tag.'''
            output.write('<parameters>\n')
            for p, v in h.measurements.items():
                make_measure(h, p, v, output)
            for c, v in h.constraints.items():
                make_constraint(h, c, v, output)
            output.write('</parameters>\n')

        def make_fragment(h, inx, output):
            '''Make a single fragment tag.'''
            output.write('<fragment type="3d_hit_fragment" fragment_index="%d" query_index="1">\n' % (inx+1))
            make_atoms(h, output)
            make_params(h, output)
            # Drop the hit's cached molecule once its fragment has been written.
            h._molecule = None
            output.write('</fragment>\n')

        def make_fragments(p, output):
            '''Make all the fragment tags.'''
            output.write('<fragments>\n')
            for i, h in enumerate(p):
                make_fragment(h, i, output)
            output.write('</fragments>\n')

        def make_match(p, output):
            '''Make a match tag.'''
            h = p[0]
            if h._database:
                db = h._database._real_database(h.identifier)
                csd_loc = db.file_name
                if hasattr(db, 'inf_file'):
                    csd_inf = db.inf_file().full_path()
                else:
                    csd_inf = None
            else:
                db = h._binary_database
                if db is None:
                    csd_loc = ""
                else:
                    # NOTE(review): nesting reconstructed from a
                    # whitespace-collapsed listing; the '_ASER' strip is
                    # assumed to apply to the file_name() result -- confirm
                    # against the original.
                    try:
                        csd_loc = db.file_name()
                    except (AttributeError, RuntimeError):
                        try:
                            csd_loc = db._db.source_database_name(
                                UtilitiesLib.DatabaseEntryIdentifier(h.identifier)
                            )
                        except Exception:
                            # Best effort: any lookup failure leaves the
                            # location blank, but KeyboardInterrupt and
                            # SystemExit are no longer swallowed.
                            csd_loc = ""
                    else:
                        if '_ASER' in csd_loc:
                            csd_loc = csd_loc.replace('_ASER', '')
                csd_inf = None
            output.write('<match identifier="%s">\n' % p[0].identifier)
            output.write('<database>%s</database>\n' % csd_loc)
            if csd_inf:
                output.write('<inffile>%s</inffile>\n' % csd_inf)
            make_fragments(p, output)
            output.write('</match>\n')

        def make_tree(parts, output):
            '''Make the whole XML tree.'''
            output.write('<search_results version="2.0">\n')
            output.write('<search_label>search1</search_label>\n')
            output.write('<search_id>123456</search_id>\n')
            output.write('<active_hit>%s</active_hit>\n' % self[0].identifier)
            output.write('<action>analyse</action>\n')
            for p in parts:
                make_match(p, output)
            output.write('</search_results>\n')

        parts = split_by_id()
        if not file_name.endswith('.c2m'):
            file_name += '.c2m'
        with open(file_name, 'w') as writer:
            make_tree(parts, writer)
def __init__(self, settings=None):
    '''Create an empty substructure query.

    :param settings: the settings object to attach to the query; when None a
        fresh ``SubstructureSearch.Settings()`` is created.
    '''
    # The query starts from a motif constructed with an empty string.
    self._motif = MotifSearchLib.Motif('')
    self.substructures = []
    # Name-keyed registries populated by the add_* methods.
    self.measurements = {}
    self.constraints = {}
    self.contacts = {}
    self.geometric_objects = {}
    self.settings = SubstructureSearch.Settings() if settings is None else settings
    # The telemetry call is made only while the class-level flag is still 0.
    cls = type(self)
    if cls._telemetry == 0:
        UtilitiesLib.ccdc_motif_search_telemetry()
        cls._telemetry = 1
def add_substructure(self, substructure):
    '''Add a substructure.

    Disconnected substructures may be accepted if the first substructure is
    contiguous at the start. Multiple substructures may be added as a result.

    :param substructure: :class:`ccdc.search.QuerySubstructure`, or a
        ``ConnserSubstructure`` whose substructures, geometric constraints,
        contacts and geometric objects are all imported into this query.
    :returns: the index of the substructure added; for a
        ``ConnserSubstructure`` the list of indices of all the substructures
        it contributed.
    :raises: NotImplementedError if a ``ConnserSubstructure`` is added to a
        query that already holds substructures.
    '''
    if not isinstance(substructure, ConnserSubstructure):
        self.substructures.append(substructure)
        return self._motif.add_substructure(substructure._substructure)

    # Check before touching any state, so a refused call leaves the query
    # (in particular self._motif) exactly as it was.
    if self.substructures:
        # Need to relocate the new substructure(s)
        raise NotImplementedError('Need to relocate substructures')

    _conn = substructure._conn
    self._conn_motif = _motif = MotifSearchLib.create_motif(_conn)
    self._motif = _motif

    # Currently needed, but I'm not sure if it's still relevant
    inxs = list(range(_motif.nsubstructures()))
    for i in inxs:
        self.substructures.append(QuerySubstructure(_substructure=_motif.substructure(i)))

    # Down-casters keyed by the class name each constraint object reports;
    # built once rather than on every pass of the inner loop.
    object_types = dict(
        ConstraintAtomGroup=SubstructureSearchLib.Object_as_AtomGroup,
        ConstraintPlane=SubstructureSearchLib.Object_as_Plane,
        ConstraintAtomPoint=SubstructureSearchLib.Object_as_AtomPoint,
        ConstraintCentroidPoint=SubstructureSearchLib.Object_as_CentroidPoint,
        ConstraintDummyPoint=SubstructureSearchLib.Object_as_DummyPoint,
        ConstraintVector=SubstructureSearchLib.Object_as_Vector,
    )

    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed listing - confirm against the original file.
    possibles = []
    for i in range(_conn.n_geometric_constraints()):
        c = _conn.geometric_constraint(i)
        if not c.label().startswith('?'):
            self._motif.add_motif_parameter(MotifSearchLib.MotifGeometricConstraintParameter(c))
        cond = c.condition()
        if cond.name() == 'always false':
            # Decided below: paired with a contact or kept as a measurement.
            possibles.append(c)
        else:
            self.constraints[c.label()] = c
        for inx in range(c.nobjects()):
            obj = c.objects(inx)
            if ':' not in obj.label():
                self.geometric_objects[obj.label()] = object_types.get(obj.class_name(), lambda x: x)(obj)

    # Each contact is paired, in order, with an 'always false' constraint,
    # which is cloned and given the contact's inclusive distance range.
    contacts = _conn.contacts()
    for c, p in zip(contacts, possibles):
        clone = p.clone()
        cond = SubstructureSearchLib.InclusiveRange(c.criterion().min(), c.criterion().max())
        clone.set_condition(cond)
        self.constraints[p.label()] = clone
    # Whatever was not paired with a contact is recorded as a measurement.
    for p in possibles[len(contacts):]:
        self.measurements[p.label()] = p

    for i in range(_motif.n_objects_contacts()):
        oc = _motif.object_contact(i)
        a = oc.object_a()
        if a.class_name() != 'ConstraintAtomPoint':
            if ':' not in a.label():
                self.geometric_objects[a.label()] = SubstructureSearchLib.Object_as_Point(a)
        b = oc.object_b()
        if b.class_name() != 'ConstraintAtomPoint':
            if ':' not in b.label():
                self.geometric_objects[b.label()] = SubstructureSearchLib.Object_as_Point(b)
        dist_con = SubstructureSearchLib.PointDistanceConstraint(
            SubstructureSearchLib.Object_as_Point(a),
            SubstructureSearchLib.Object_as_Point(b),
            SubstructureSearchLib.AlwaysTrue(),
            'CONT%d' % (i+1)
        )
        self._motif.add_motif_parameter(MotifSearchLib.MotifGeometricConstraintParameter(dist_con))
        self.constraints['CONT%d' % (i+1)] = dist_con

    # NOTE(review): self.geometric_object is not defined in the visible part
    # of the class - confirm it exists.
    for i in range(_motif.nobjects()):
        obj = self.geometric_object(i)
        if ':' not in obj.label():
            self.geometric_objects[obj.label()] = obj
    return inxs
def _point_must_have_site(self, sub_id, atom_id):
    '''Flag a query atom as needing 3D coordinates.

    The atom is left untouched when it already reports a constraint of the
    ``AtomHas3DSiteConstraint`` type.

    :param sub_id: index into ``self.substructures``.
    :param atom_id: index into that substructure's ``atoms``.
    '''
    query_atom = self.substructures[sub_id].atoms[atom_id]
    site_constraint = SubstructureSearchLib.AtomHas3DSiteConstraint()
    if query_atom._substructure_atom.has_constraint_of_type(site_constraint):
        return
    query_atom.has_3d_coordinates = True

def _args_to_points(self, required, args, require_3d=True):
    '''Convert point specifications into constraint point objects.

    :param required: the exact number of points expected, or 0 for any number.
    :param args: a sequence mixing three forms: two consecutive ints
        (substructure_index, atom_index), a tuple or list holding that pair,
        or the name of an entry in ``self.geometric_objects``.
    :param require_3d: when true, every atom given by index is passed to
        ``_point_must_have_site``.
    :returns: the list of point objects, in argument order.
    :raises: TypeError for an argument of any other type.
    '''
    points = []
    pos = 0
    total = len(args)
    while pos < total:
        spec = args[pos]
        if isinstance(spec, str):
            named = self.geometric_objects[spec]
            if isinstance(named, SubstructureSearchLib.ConstraintAtomGroup):
                # An atom group contributes its centroid as the point.
                points.append(named.centroid_from_group())
            else:
                points.append(named)
            pos += 1
            continue
        if isinstance(spec, int):
            # Old style: the pair is spread over two consecutive arguments.
            sub_id, atom_id = spec, args[pos + 1]
            step = 2
        elif isinstance(spec, (tuple, list)):
            # New style: the pair is a single argument.
            sub_id, atom_id = spec[0], spec[1]
            step = 1
        else:
            raise TypeError('Invalid type for a point')
        points.append(SubstructureSearchLib.ConstraintAtomPoint(sub_id, atom_id))
        if require_3d:
            self._point_must_have_site(sub_id, atom_id)
        pos += step
    assert required == 0 or required == len(points)
    return points

##### GeometricObjects #####
def add_centroid(self, name, *args):
    '''Register a centroid with the query.

    :param name: the key under which the centroid is stored in ``geometric_objects``.
    :param `*args`: the points or geometric objects defining the centroid,
        each either a pair (substructure_index, atom_index) or the name of a
        geometric object. At least two are expected; the count is not checked here.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_centroid('CENT1', (0, 0), (0, 1), (0, 2))
    >>> query.add_centroid('CENT2', (1, 0), (1, 1), (1, 2))
    >>> query.add_centroid('CENT3', 'CENT1', 'CENT2')
    '''
    members = self._args_to_points(0, args)
    new_centroid = SubstructureSearchLib.ConstraintCentroidPoint(members, name)
    # Recorded locally first, then handed to the motif.
    self.geometric_objects[name] = new_centroid
    self._motif.add_object(new_centroid)
def add_dummy_point(self, name, distance, *args):
    '''Register a dummy point placed along the vector between two points.

    :param name: the key under which the point is stored in ``geometric_objects``.
    :param distance: the distance along the vector defined by the two points.
    :param `*args`: exactly two points, each either a pair
        (substructure_index, atom_index) or the name of another geometric object.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_centroid('CENT1', (0, 0), (0, 1), (0, 2))
    >>> query.add_dummy_point('DUM1', 2.0, 'CENT1', (1, 1))
    '''
    ends = self._args_to_points(2, args)
    new_point = SubstructureSearchLib.ConstraintDummyPoint(ends[0], ends[1], distance, name)
    self.geometric_objects[name] = new_point
    self._motif.add_object(new_point)
def add_group(self, name, *args):
    '''Register a group of matched atoms.

    :param name: the key under which the group is stored in ``geometric_objects``.
    :param `*args`: pairs (substructure_index, atom_index) naming the atoms
        of the group; any number is accepted.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_group('GP1', (0, 0), (0, 1), (0, 2))
    '''
    members = self._args_to_points(0, args)
    atom_group = SubstructureSearchLib.ConstraintAtomGroup(members, name)
    self.geometric_objects[name] = atom_group
    self._motif.add_object(atom_group)
def add_vector(self, name, *args):
    '''Register a vector between two points.

    :param name: the key under which the vector is stored in ``geometric_objects``.
    :param `*args`: exactly two points, each either a pair
        (substructure_index, atom_index) or the name of another geometric object.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_centroid('CENT1', (0, 0), (0, 1), (0, 2))
    >>> query.add_vector('VEC1', 'CENT1', (1, 2))
    '''
    ends = self._args_to_points(2, args)
    new_vector = SubstructureSearchLib.ConstraintVector(ends[0], ends[1], name)
    # Here the motif receives the object before the local registry does.
    self._motif.add_object(new_vector)
    self.geometric_objects[name] = new_vector
def add_plane(self, name, *args):
    '''Register a plane through a set of points.

    :param name: the key under which the plane is stored in ``geometric_objects``.
    :param `*args`: the points of the plane, each either a pair
        (substructure_index, atom_index) or the name of another geometric
        object. At least two are expected; the count is not checked here.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_plane('PLANE1', (0, 0), (0, 1), (0, 2))
    >>> query.add_plane('PLANE2', (1, 0), (1, 1), (1, 2))
    '''
    members = self._args_to_points(0, args)
    new_plane = SubstructureSearchLib.ConstraintPlane(members, name)
    # Here the motif receives the object before the local registry does.
    self._motif.add_object(new_plane)
    self.geometric_objects[name] = new_plane
##### Measurements #####
def add_distance_measurement(self, name, *args):
    '''Record the distance between two points as a measurement.

    :param name: the key under which the measurement is stored in ``measurements``.
    :param `*args`: exactly two points, each either a pair
        (substructure_index, atom_index) or the name of a geometric object.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_centroid('CENT1', (0, 0), (0, 1), (0, 2))
    >>> query.add_centroid('CENT2', (1, 0), (1, 1), (1, 2))
    >>> query.add_distance_measurement('DIST1', (0, 0), 'CENT2')
    '''
    ends = self._args_to_points(2, args)
    # The measurement is built with an AlwaysTrue condition.
    measure = SubstructureSearchLib.PointDistanceConstraint(
        ends[0], ends[1], SubstructureSearchLib.AlwaysTrue(), name
    )
    self.measurements[name] = measure
    parameter = MotifSearchLib.MotifGeometricConstraintParameter(measure)
    self._motif.add_motif_parameter(parameter)
def add_angle_measurement(self, name, *args):
    '''Record the angle defined by three points as a measurement.

    :param name: the key under which the measurement is stored in ``measurements``.
    :param `*args`: exactly three points, each either a pair
        (substructure_index, atom_index) or the name of a geometric object.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_centroid('CENT1', (0, 0), (0, 1), (0, 2))
    >>> query.add_centroid('CENT2', (1, 0), (1, 1), (1, 2))
    >>> query.add_angle_measurement('ANG1', (0, 0), (1, 1), (1, 0))
    '''
    corners = self._args_to_points(3, args)
    measure = SubstructureSearchLib.PointAngleConstraint(
        corners[0], corners[1], corners[2], SubstructureSearchLib.AlwaysTrue(), name
    )
    self.measurements[name] = measure
    parameter = MotifSearchLib.MotifGeometricConstraintParameter(measure)
    self._motif.add_motif_parameter(parameter)
def add_torsion_angle_measurement(self, name, *args):
    '''Record the torsion angle defined by four points as a measurement.

    :param name: the key under which the measurement is stored in ``measurements``.
    :param `*args`: exactly four points, each either a pair
        (substructure_index, atom_index) or the name of a geometric object.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_centroid('CENT1', (0, 0), (0, 1), (0, 2))
    >>> query.add_centroid('CENT2', (1, 0), (1, 1), (1, 2))
    >>> query.add_torsion_angle_measurement('ANG1', (0, 0), (0, 1), (1, 1), (1, 0))
    '''
    chain = self._args_to_points(4, args)
    measure = SubstructureSearchLib.PointTorsionConstraint(
        chain[0], chain[1], chain[2], chain[3], SubstructureSearchLib.AlwaysTrue(), name
    )
    self.measurements[name] = measure
    parameter = MotifSearchLib.MotifGeometricConstraintParameter(measure)
    self._motif.add_motif_parameter(parameter)
def add_vector_angle_measurement(self, name, *args):
    '''Record the angle between two vectors as a measurement.

    :param name: the key under which the measurement is stored in ``measurements``.
    :param `*args`: exactly two arguments; the example passes the names of
        two previously added vectors.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_vector('VEC1', (0, 1), (1, 2))
    >>> query.add_vector('VEC2', (0, 2), (1, 1))
    >>> query.add_vector_angle_measurement('ANG1', 'VEC1', 'VEC2')
    '''
    vectors = self._args_to_points(2, args)
    measure = SubstructureSearchLib.VectorAngleConstraint(
        vectors[0], vectors[1], SubstructureSearchLib.AlwaysTrue(), name
    )
    self.measurements[name] = measure
    parameter = MotifSearchLib.MotifGeometricConstraintParameter(measure)
    self._motif.add_motif_parameter(parameter)
def add_plane_angle_measurement(self, name, *args):
    '''Record the angle between two planes as a measurement.

    :param name: the key under which the measurement is stored in ``measurements``.
    :param `*args`: exactly two arguments; the example passes the names of
        two previously added planes.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_plane('PLANE1', (0, 0), (0, 1), (0, 2))
    >>> query.add_plane('PLANE2', (1, 0), (1, 1), (1, 2))
    >>> query.add_plane_angle_measurement('PA1', 'PLANE1', 'PLANE2')
    '''
    planes = self._args_to_points(2, args)
    measure = SubstructureSearchLib.PlaneAngleConstraint(
        planes[0], planes[1], SubstructureSearchLib.AlwaysTrue(), name
    )
    self.measurements[name] = measure
    parameter = MotifSearchLib.MotifGeometricConstraintParameter(measure)
    self._motif.add_motif_parameter(parameter)
def add_point_plane_distance_measurement(self, name, *args):
    '''Record the distance from a point to a plane as a measurement.

    The underlying constraint is created in its ``ABSOLUTE`` mode.

    :param name: the key under which the measurement is stored in ``measurements``.
    :param `*args`: exactly two arguments; the example passes a point
        followed by the name of a plane.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_centroid('CENT1', (0, 0), (0, 1), (0, 2))
    >>> query.add_plane('PLANE2', (1, 0), (1, 1), (1, 2))
    >>> query.add_point_plane_distance_measurement('PP1', 'CENT1', 'PLANE2')
    '''
    operands = self._args_to_points(2, args)
    measure = SubstructureSearchLib.PointPlaneDistanceConstraint(
        operands[0],
        operands[1],
        SubstructureSearchLib.AlwaysTrue(),
        name,
        SubstructureSearchLib.PointPlaneDistanceConstraint.ABSOLUTE
    )
    self.measurements[name] = measure
    parameter = MotifSearchLib.MotifGeometricConstraintParameter(measure)
    self._motif.add_motif_parameter(parameter)
def add_vector_plane_angle_measurement(self, name, *args):
    '''Record the angle between a vector and a plane as a measurement.

    :param name: the key under which the measurement is stored in ``measurements``.
    :param `*args`: exactly two arguments; the example passes the name of a
        vector followed by the name of a plane.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_vector('VEC1', (0, 1), (1, 2))
    >>> query.add_plane('PLANE2', (1, 0), (1, 1), (1, 2))
    >>> query.add_vector_plane_angle_measurement('ANG1', 'VEC1', 'PLANE2')
    '''
    operands = self._args_to_points(2, args)
    measure = SubstructureSearchLib.VectorPlaneAngleConstraint(
        operands[0], operands[1], SubstructureSearchLib.AlwaysTrue(), name
    )
    self.measurements[name] = measure
    parameter = MotifSearchLib.MotifGeometricConstraintParameter(measure)
    self._motif.add_motif_parameter(parameter)
def add_atom_property_measurement(self, name, *args, **kw):
    '''Record a property of a single matched atom as a measurement.

    :param name: the key under which the measurement is stored in ``measurements``.
    :param `*args`: a pair (substructure_index, atom_index) identifying the
        atom; the atom is not forced to have 3D coordinates.
    :param which: required keyword, one of TotalCoordinationNumber,
        AtomicNumber, VdwRadius, CovalentRadius (resolved by prefix lookup).

    >>> query = SubstructureSearch()
    >>> substructure = QuerySubstructure()
    >>> _ = substructure.add_atom(['C', 'N'])
    >>> _ = query.add_substructure(substructure)
    >>> query.add_atom_property_measurement('ATOM1', (0, 0), which='AtomicNumber')
    '''
    property_cls = SubstructureSearchLib.Atom3DPropertyConstraint
    _which_dic = utilities.bidirectional_dict(
        TotalCoordinationNumber=property_cls.TotalCoordinationNumber,
        AtomicNumber=property_cls.AtomicNumber,
        VdwRadius=property_cls.VdwRadius,
        CovalentRadius=property_cls.CovalentRadius,
    )
    atom_points = self._args_to_points(1, args, require_3d=False)
    selected = _which_dic.prefix_lookup(kw['which'])
    measure = property_cls(
        atom_points[0], selected, SubstructureSearchLib.AlwaysTrue(), name
    )
    self.measurements[name] = measure
    parameter = MotifSearchLib.MotifGeometricConstraintParameter(measure)
    self._motif.add_motif_parameter(parameter)
def add_constant_value_measurement(self, name, value):
    '''Record a constant as a named measurement.

    :param name: the key under which the constant is stored in ``measurements``;
        also set as the label of the underlying constraint.
    :param value: a float.

    >>> query = SubstructureSearch()
    >>> substructure = QuerySubstructure()
    >>> _ = substructure.add_atom(['C', 'N'])
    >>> _ = query.add_substructure(substructure)
    >>> query.add_constant_value_measurement('PI', 3.14159)
    '''
    constant = SubstructureSearchLib.ConstantValueConstraint(value)
    # This constraint type is labelled after construction.
    constant.set_label(name)
    self.measurements[name] = constant
    parameter = MotifSearchLib.MotifGeometricConstraintParameter(constant)
    self._motif.add_motif_parameter(parameter)
def add_unary_transform_measurement(self, name, which, arg):
    '''Record the result of a one-argument mathematical function as a measurement.

    :param name: the key under which the result is stored in ``measurements``.
    :param which: one of 'ABS', 'LOG', 'LOG10', 'EXP', 'COS', 'SIN', 'TAN',
        'ACOS', 'ASIN', 'ATAN', 'FLOOR', 'ROUND', 'SQRT', 'NEG' (resolved by
        prefix lookup).
    :param arg: the name of the measurement, constraint or contact to which
        the function is applied; the three registries are consulted in that order.
    :raises: TypeError if ``arg`` names nothing known to the query.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_vector('VEC1', (0, 1), (1, 2))
    >>> query.add_vector('VEC2', (0, 2), (1, 1))
    >>> query.add_vector_angle_measurement('ANG1', 'VEC1', 'VEC2')
    >>> query.add_unary_transform_measurement('ABS_ANGLE', 'ABS', 'ANG1')
    '''
    transform_cls = SubstructureSearchLib.UnaryTransformConstraint
    _unary_transform_dic = utilities.bidirectional_dict(
        ABS=transform_cls.ABS,
        LOG=transform_cls.LOG,
        LOG10=transform_cls.LOG10,
        EXP=transform_cls.EXP,
        COS=transform_cls.COS,
        SIN=transform_cls.SIN,
        TAN=transform_cls.TAN,
        ACOS=transform_cls.ACOS,
        ASIN=transform_cls.ASIN,
        ATAN=transform_cls.ATAN,
        # FLOOR and ROUND are exposed under the library's INT and NINT members.
        FLOOR=transform_cls.INT,
        ROUND=transform_cls.NINT,
        SQRT=transform_cls.SQRT,
        NEG=transform_cls.NEG,
    )
    kind = _unary_transform_dic.prefix_lookup(which)
    # First registry that knows the name wins.
    operand = None
    for registry in (self.measurements, self.constraints, self.contacts):
        if arg in registry:
            operand = registry[arg]
            break
    if operand is None:
        raise TypeError('No measurement or constraint for %s' % arg)
    measure = transform_cls(kind, operand, SubstructureSearchLib.AlwaysTrue(), name)
    self.measurements[name] = measure
    parameter = MotifSearchLib.MotifGeometricConstraintParameter(measure)
    self._motif.add_motif_parameter(parameter)
def add_binary_transform_measurement(self, name, which, arg1, arg2):
    '''Add a binary mathematical operation.

    :param name: the name by which this value will be accessed.
    :param which: one of 'MAX', 'MIN', 'ADD', 'SUBTRACT', 'MULTIPLY',
        'DIVIDE', 'POW', 'RSIN', 'RCOS' (resolved by prefix lookup).
    :param arg1, arg2: the names of the measurements, constraints or contacts
        used as the operands; the three registries are consulted in that order.
    :raises: TypeError if either operand names nothing known to the query.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_vector('VEC1', (0, 1), (1, 2))
    >>> query.add_vector('VEC2', (0, 2), (1, 1))
    >>> query.add_vector_angle_measurement('ANG1', 'VEC1', 'VEC2')
    >>> query.add_constant_value_measurement('D2R', 3.14159/180)
    >>> query.add_binary_transform_measurement('IN_RADIANS', 'MUL', 'ANG1', 'D2R')
    '''
    def _operand(arg):
        '''Look the name up in measurements, constraints, then contacts.'''
        found = self.measurements.get(arg, self.constraints.get(arg, self.contacts.get(arg, None)))
        if found is None:
            raise TypeError('No measurement or constraint for %s' % arg)
        return found

    transform_cls = SubstructureSearchLib.BinaryTransformConstraint
    _binary_transform_dic = utilities.bidirectional_dict(
        MAX=transform_cls.MAX,
        MIN=transform_cls.MIN,
        ADD=transform_cls.ADD,
        SUBTRACT=transform_cls.SUBTRACT,
        MULTIPLY=transform_cls.MULTIPLY,
        DIVIDE=transform_cls.DIVIDE,
        POW=transform_cls.POW,
        RSIN=transform_cls.RSIN,
        RCOS=transform_cls.RCOS
    )
    kind = _binary_transform_dic.prefix_lookup(which)
    c1 = _operand(arg1)
    # The second operand is validated like the first, so an unknown name is
    # reported here instead of None reaching the constraint constructor.
    c2 = _operand(arg2)
    constraint = transform_cls(kind, c1, c2, SubstructureSearchLib.AlwaysTrue(), name)
    self.measurements[name] = constraint
    self._motif.add_motif_parameter(MotifSearchLib.MotifGeometricConstraintParameter(constraint))
def _add_constraint(self, name, r):
    '''Turn an existing measurement into a constraint.

    The measurement is removed from ``measurements``, given the condition
    decoded from ``r``, stored in ``constraints`` and added to the motif as
    an object constraint.

    :param name: the name of a measurement already present in ``measurements``.
    :param r: the condition, in the form accepted by ``_decode_condition``.
    '''
    promoted = self.measurements.pop(name)
    promoted.set_condition(_decode_condition(r))
    self.constraints[name] = promoted
    self._motif.add_object_constraint(promoted)

##### Constraints #####
def add_distance_constraint(self, name, *args, **kw):
    '''Add a distance constraint.

    :param name: the name of this constraint.
    :param `*args`: two points, either as pairs (substructure_index, atom_index)
        or as names of geometric objects, followed by the condition (see
        ``range``). For the old positional style the condition may be followed
        by a contact type string and then a vdw_corrected bool.
    :param range: the condition (passed positionally after the points),
        either as a pair of floats or a pair (operator, value) where operator
        may be '==', '>', '<', '>=', '<=', '!=', or a pair ('in', list(values)).
    :param type: keyword, default 'inter'. A value starting with 'intra'
        selects intramolecular contacts, one starting with 'any' selects any
        contact, anything else selects intermolecular contacts.
    :param vdw_corrected: keyword, default False; whether the distance range
        should be relative to the Van der Waals radii of the atoms involved.
    :raises: TypeError if the condition is neither a list nor a tuple.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_distance_constraint('DIST1', (0, 1), (1, 1), (-5, 0), vdw_corrected=True, type='any')
    >>> query.add_distance_constraint('DIST2', (0, 2), (1, 2), ('<=', 3.0), vdw_corrected=True, type='any')
    '''
    # NOTE(review): indentation was reconstructed from a whitespace-collapsed
    # listing - confirm the nesting against the original file.
    kind = kw.get('type', 'inter')
    vdw_corrected = kw.get('vdw_corrected', False)
    # Peel optional trailing positionals off the end of args: first a bool
    # (vdw_corrected), then a str (contact type). `off` ends up as the
    # negative index of the condition.
    off = 1
    if isinstance(args[-off], bool):
        vdw_corrected = args[-off]
        off += 1
    if isinstance(args[-off], str):
        kind = args[-off]
        off += 1
    if kind.lower().startswith('intra'):
        which = ChemistryLib.ContactCriterion.INTRAMOLECULAR
    elif kind.lower().startswith('any'):
        which = ChemistryLib.ContactCriterion.ANY
    else:
        which = ChemistryLib.ContactCriterion.INTERMOLECULAR
    r = args[-off]
    # Everything before the condition must resolve to exactly two points.
    points = self._args_to_points(2, args[:-off])
    if isinstance(r, (list, tuple)):
        if isinstance(r[0], str) or any(isinstance(a, str) for a in args[:-off]):
            # Operator-style condition, or a named geometric object among the
            # points: use a geometric point-distance constraint. `which` and
            # `vdw_corrected` are not used on this path.
            cond = _decode_condition(r)
            constraint = SubstructureSearchLib.PointDistanceConstraint(points[0], points[1], cond, name)
            self._motif.add_object_constraint(constraint)
            self.constraints[name] = constraint
            self._motif.add_motif_parameter(MotifSearchLib.MotifGeometricConstraintParameter(constraint))
        else:
            # Numeric range between two atoms: use an atom-atom contact.
            crit = SubstructureSearchLib.InterAtomicDistanceCriterion(min(r), max(r), which, vdw_corrected)
            crit.set_min_path_length(3)
            crit.set_max_path_length(999)
            # Recover the two (substructure, atom) index pairs from the raw
            # arguments, which may use the spread-int or the pair form.
            i = 0
            while i < len(args):
                a = args[i]
                if isinstance(a, (list, tuple)):
                    if i == 0:
                        sub_inx1 = a[0]
                        at_inx1 = a[1]
                        i += 1
                    else:
                        sub_inx2 = a[0]
                        at_inx2 = a[1]
                        break
                elif isinstance(a, int):
                    if i == 0:
                        sub_inx1 = a
                        at_inx1 = args[i+1]
                        i += 2
                    else:
                        sub_inx2 = a
                        at_inx2 = args[i+1]
                        break
            constraint = SubstructureSearchLib.SubstructureContact(sub_inx1, at_inx1, sub_inx2, at_inx2, crit)
            self._motif.add_contact(constraint)
            # Contacts are kept in their own registry, not in constraints.
            self.contacts[name] = constraint
            self._motif.add_motif_parameter(MotifSearchLib.MotifDistanceParameter(name, sub_inx1, at_inx1, sub_inx2, at_inx2))
    else:
        raise TypeError('Invalid value for condition {}'.format(r))
def add_angle_constraint(self, name, *args):
    '''Constrain the angle defined by three points.

    :param name: the name by which the constraint will be accessed.
    :param `*args`: three points, each either a pair
        (substructure_index, atom_index) or the name of a geometric object,
        followed by the condition.
    :param range: the condition (the last positional argument), as for
        :meth:`ccdc.search.SubstructureSearch.add_distance_constraint`

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_centroid('CENT1', (0, 0), (0, 1), (0, 2))
    >>> query.add_centroid('CENT2', (1, 0), (1, 1), (1, 2))
    >>> query.add_angle_constraint('ANG1', (0, 0), (1, 1), (1, 0), ('>=', 120))
    '''
    # Everything but the final argument describes the measurement.
    point_specs = args[:-1]
    self.add_angle_measurement(name, *point_specs)
    self._add_constraint(name, args[-1])
def add_torsion_angle_constraint(self, name, *args):
    '''Constrain the torsion angle defined by four points.

    :param name: the name by which this constraint is accessed.
    :param `*args`: four points followed by the condition, in the forms
        described for :meth:`ccdc.search.SubstructureSearch.add_distance_constraint`

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_centroid('CENT1', (0, 0), (0, 1), (0, 2))
    >>> query.add_centroid('CENT2', (1, 0), (1, 1), (1, 2))
    >>> query.add_torsion_angle_constraint('ANG1', (0, 0), (0, 1), (1, 1), (1, 0), (120, 180))
    '''
    point_specs = args[:-1]
    self.add_torsion_angle_measurement(name, *point_specs)
    self._add_constraint(name, args[-1])
def add_vector_angle_constraint(self, name, *args):
    '''Constrain the angle between two vectors.

    :param name: the name by which this constraint is accessed.
    :param `*args`: the two vector arguments followed by the condition.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_vector('VEC1', (0, 1), (1, 2))
    >>> query.add_vector('VEC2', (0, 2), (1, 1))
    >>> query.add_vector_angle_constraint('ANG1', 'VEC1', 'VEC2', (0, 60))
    '''
    vector_specs = args[:-1]
    self.add_vector_angle_measurement(name, *vector_specs)
    self._add_constraint(name, args[-1])
def add_plane_angle_constraint(self, name, *args):
    '''Constrain the angle between two planes.

    :param name: the name by which this constraint is accessed.
    :param `*args`: the two plane arguments followed by the condition.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_plane('PLANE1', (0, 0), (0, 1), (0, 2))
    >>> query.add_plane('PLANE2', (1, 0), (1, 1), (1, 2))
    >>> query.add_plane_angle_constraint('PA1', 'PLANE1', 'PLANE2', (-10, 10))
    '''
    plane_specs = args[:-1]
    self.add_plane_angle_measurement(name, *plane_specs)
    self._add_constraint(name, args[-1])
def add_point_plane_distance_constraint(self, name, *args):
    '''Constrain the distance from a point to a plane.

    :param name: the name by which this constraint is accessed.
    :param `*args`: the point and plane arguments followed by the condition.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_centroid('CENT1', (0, 0), (0, 1), (0, 2))
    >>> query.add_plane('PLANE2', (1, 0), (1, 1), (1, 2))
    >>> query.add_point_plane_distance_constraint('PP1', 'CENT1', 'PLANE2', ('<', 5))
    '''
    object_specs = args[:-1]
    self.add_point_plane_distance_measurement(name, *object_specs)
    self._add_constraint(name, args[-1])
def add_vector_plane_angle_constraint(self, name, *args):
    '''Constrain the angle between a vector and a plane.

    :param name: the name by which this constraint is accessed.
    :param `*args`: the vector and plane arguments followed by the condition.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_vector('VEC1', (0, 1), (1, 2))
    >>> query.add_plane('PLANE2', (1, 0), (1, 1), (1, 2))
    >>> query.add_vector_plane_angle_constraint('ANG1', 'VEC1', 'PLANE2', ('>', 90))
    '''
    object_specs = args[:-1]
    self.add_vector_plane_angle_measurement(name, *object_specs)
    self._add_constraint(name, args[-1])
def add_atom_property_constraint(self, name, *args, **kw):
    '''Constrain a property of a single matched atom.

    :param name: the name by which this constraint is accessed.
    :param `*args`: the atom specification followed by the condition.
    :param `**kw`: passed unchanged to
        :meth:`add_atom_property_measurement` (e.g. ``which``).

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('[*H1]'))
    >>> query.add_atom_property_constraint('ATOM1', (0, 0), ('in', [7, 8]), which='AtomicNumber')
    '''
    atom_spec = args[:-1]
    self.add_atom_property_measurement(name, *atom_spec, **kw)
    self._add_constraint(name, args[-1])
def add_unary_transform_constraint(self, name, *args):
    '''Constrain the result of a one-argument mathematical function.

    :param name: the name by which this constraint is accessed.
    :param `*args`: the function name and operand name, as for
        :meth:`add_unary_transform_measurement`, followed by the condition.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_vector('VEC1', (0, 1), (1, 2))
    >>> query.add_vector('VEC2', (0, 2), (1, 1))
    >>> query.add_vector_angle_measurement('ANG1', 'VEC1', 'VEC2')
    >>> query.add_unary_transform_constraint('ABS_ANGLE', 'ABS', 'ANG1', (0, 10))
    '''
    transform_spec = args[:-1]
    self.add_unary_transform_measurement(name, *transform_spec)
    self._add_constraint(name, args[-1])
def add_binary_transform_constraint(self, name, which, *args):
    '''Constrain the result of a two-argument mathematical operation.

    :param name: the name by which this constraint is accessed.
    :param which: the operation, as for :meth:`add_binary_transform_measurement`.
    :param `*args`: the two operand names followed by the condition.

    >>> query = SubstructureSearch()
    >>> _ = query.add_substructure(SMARTSSubstructure('C(=O)O'))
    >>> _ = query.add_substructure(SMARTSSubstructure('N(-H)H'))
    >>> query.add_vector('VEC1', (0, 1), (1, 2))
    >>> query.add_vector('VEC2', (0, 2), (1, 1))
    >>> query.add_vector_angle_measurement('ANG1', 'VEC1', 'VEC2')
    >>> query.add_constant_value_measurement('D2R', 180/3.14159)
    >>> query.add_binary_transform_constraint('IN_RADIANS', 'MUL', 'ANG1', 'D2R', (-1, 1))
    '''
    first_operand, second_operand = args[0], args[1]
    self.add_binary_transform_measurement(name, which, first_operand, second_operand)
    # The third positional argument is the condition.
    self._add_constraint(name, args[2])
@staticmethod
def from_xml(xml):
    '''Build a new substructure search from an XML string. Deprecated.

    :param xml: XML string
    :returns: a new :class:`SubstructureSearch` that has read ``xml``.
    '''
    search = SubstructureSearch()
    search.read_xml(xml)
    return search
@staticmethod
def from_xml_file(file_name):
    '''Build a new substructure search from an XML file. Deprecated.

    :param file_name: path to XML file
    :returns: the search produced by :meth:`from_xml` on the file's text.
    :raises: IOError when the file does not exist
    '''
    if os.path.exists(file_name):
        with open(file_name) as source:
            return SubstructureSearch.from_xml(source.read())
    raise IOError('The file %s does not exist' % file_name)
def read_xml(self, xml):
    '''Read search query from XML. Deprecated.

    Every motif found in the XML is merged into this query: its
    substructures, object constraints, motif constraints, contacts and
    parameters. A DeprecationWarning is issued on every call.

    :param xml: XML string
    '''
    warnings.warn('''This method is deprecated and will be removed in a later version.''', DeprecationWarning)
    rdr = CSDSQLDatabaseLib.XMLMotifReader()
    stream = UtilitiesLib.istringstream(xml)
    rdr.load(stream)

    # Converters tried on each motif constraint, with the tag used to name
    # the entry stored in self.constraints.
    constraint_casts = [
        (MotifSearchLib.motif_constraint_as_angle_constraint, 'ANGLE'),
        (MotifSearchLib.motif_constraint_as_combined_constraint, 'COMBINED'),
        (MotifSearchLib.motif_constraint_as_contact_order_constraint, 'CONTACT_ORDER'),
        (MotifSearchLib.motif_constraint_as_discrete_chain_constraint, 'DISCRETE_CHAIN'),
        (MotifSearchLib.motif_constraint_as_hydrogen_bond_angle_present_constraint, 'HBOND_ANGLE_PRESENT'),
        (MotifSearchLib.motif_constraint_as_hydrogen_bond_constraint, 'HBOND'),
        (MotifSearchLib.motif_constraint_as_nunique_contacts_constraint, 'NUNIQUE_CONTACTS'),
        (MotifSearchLib.motif_constraint_as_not_present_constraint, 'NOT_PRESENT'),
        (MotifSearchLib.motif_constraint_as_shortest_path_constraint, 'SHORTEST_PATH'),
        (MotifSearchLib.motif_constraint_as_torsion_constraint, 'TORSION'),
        (MotifSearchLib.motif_constraint_as_translation_constraint, 'TRANSLATION'),
        (MotifSearchLib.motif_constraint_as_unique_atoms_constraint, 'UNIQUE_ATOMS'),
    ]
    # Converters tried on each motif parameter.
    parameter_casts = [
        (MotifSearchLib.motif_parameter_as_distance_parameter, 'DISTANCE'),
        (MotifSearchLib.motif_parameter_as_angle_parameter, 'ANGLE'),
        (MotifSearchLib.motif_parameter_as_torsion_parameter, 'TORSION')
    ]

    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed listing - confirm against the original file.
    for i in range(rdr.nmolecules()):
        self._xml_motif = motif = rdr.motif(i)
        # merge this motif with ours
        for j in range(motif.nsubstructures()):
            self.add_substructure(QuerySubstructure(motif.substructure(j)))

        for j in range(motif.n_object_constraints()):
            obj = motif.object_constraint(j)
            if obj.class_name() == 'PointTorsionConstraint':
                # Torsions are rebuilt from their four points and then given
                # the original condition back.
                c = obj.condition()
                p0 = SubstructureSearchLib.Object_as_Point(obj.objects(0))
                p1 = SubstructureSearchLib.Object_as_Point(obj.objects(1))
                p2 = SubstructureSearchLib.Object_as_Point(obj.objects(2))
                p3 = SubstructureSearchLib.Object_as_Point(obj.objects(3))
                obj = SubstructureSearchLib.PointTorsionConstraint(
                    p0, p1, p2, p3, SubstructureSearchLib.AlwaysTrue(), obj.label()
                )
                obj.set_condition(c)
            self.constraints[obj.label()] = obj
            self._motif.add_object_constraint(obj)
            self._motif.add_motif_parameter(MotifSearchLib.MotifGeometricConstraintParameter(obj))

        for j in range(motif.nconstraints()):
            con = motif.constraint(j)
            for f, tag in constraint_casts:
                c = f(con)
                if c:
                    self.constraints['%s_%d' % (tag, len(self.constraints)+1)] = c
                    self._motif.add_motif_constraint(con)
                    self._motif.add_motif_parameter(MotifSearchLib.MotifGeometricConstraintParameter(con))

        for j in range(motif.ncontacts()):
            con = motif.motif_contact(j)
            self.contacts['CONTACT_%d' % j] = con
            self._motif.add_contact(con)
            s1 = con.substruct_a()
            a1 = con.atom_a()
            s2 = con.substruct_b()
            a2 = con.atom_b()
            self._motif.add_motif_parameter(MotifSearchLib.MotifDistanceParameter('CONTACT_%d' % j, s1, a1, s2, a2))

        for j in range(motif.nparameters()):
            # Index with the parameter loop variable j; using the motif
            # index i here re-read the same parameter on every pass.
            par = motif.parameter(j)
            self._motif.add_parameter(par)
            for f, tag in parameter_casts:
                p = f(par)
                if p:
                    self.measurements[p.name()] = p
                    self._motif.add_motif_parameter(p)
def read_xml_file(self, file_name):
    '''Load a search query from an XML file into this search. Deprecated.

    :param file_name: path to XML file
    :raises: IOError if the file does not exist
    '''
    if os.path.exists(file_name):
        with open(file_name) as source:
            self.read_xml(source.read())
        return
    raise IOError('The file %s does not exist' % file_name)
def _add_enantiomer_consistency(self):
    '''Apply the enantiomer search setting to the motif'''
    MotifSearchLib.MotifTorsionInversionConsistencyConstraint.add_to_motif(
        self._motif, self.settings._match_enantiomers
    )


def _search_reader(self, database):
    '''Private: run the substructure search over a database reader.

    A motif searcher is created once per reader and cached on it as
    ``_motif_searcher``.

    :param database: the reader to search
    :returns: a :class:`SubstructureSearch.SubstructureHitList`
    :raises: TypeError if no substructure has been added
    :raises: NotImplementedError if the reader cannot supply a motif searcher
    '''
    if self._motif.nsubstructures() == 0:
        raise TypeError('No substructures to search')
    self._add_enantiomer_consistency()
    if not hasattr(database, '_motif_searcher'):
        try:
            database._motif_searcher = database._db.searcher_factory().motif_searcher()
        except (RuntimeError, NameError, AttributeError):
            # Best effort: the absence of the attribute is handled below.
            pass
    if hasattr(database, '_motif_searcher'):
        results_writer = CSDSQLDatabaseLib.CrystalStructureDatabaseSearchVectorResultsWriter()
        # Matches are collected by the results writer; the return value of
        # search() is not used.
        database._motif_searcher.search(self._motif, self.settings._settings, results_writer)
        hit_list = SubstructureSearch.SubstructureHitList(
            SubstructureSearch.SubstructureHit._from_match(x, self, _binary_database=database._db)
            for x in results_writer.matches()
        )
        if database.__class__.__name__ == 'MoleculeReader':
            def skip_suppressed_hit(hit):
                # A hit whose matched atoms cannot be retrieved is dropped.
                try:
                    hit.match_atoms()
                    return False
                except RuntimeError:
                    return True
            hit_list = SubstructureSearch.SubstructureHitList(
                hit for hit in hit_list if not skip_suppressed_hit(hit)
            )
        return hit_list
    else:
        # there used to be fallback code here, but now we expect to always support motif search on any database
        raise NotImplementedError("Substructure search is not implemented on this database type")


def _search_entry(self, entry, _database=None):
    '''Private: run the substructure search against a single entry.

    :param entry: the entry to search
    :param _database: passed through to :meth:`_search_crystal` when the
        entry-level searcher is not valid
    :returns: a list of hits; empty when the entry fails the settings constraints
    :raises: TypeError if no substructure has been added
    '''
    if self._motif.nsubstructures() == 0:
        raise TypeError('No substructures to search')
    if not CSDSQLDatabaseLib.test_entry_settings_constraints(self.settings._settings, entry._entry):
        return []
    self._add_enantiomer_consistency()
    if entry._entry.chemical_diagram_views() is None:
        # Generate diagram views from the entry's molecule when it has none.
        generator = ChemistryLib.ChemicalDiagramGenerator()
        diagram = generator.create_chemical_diagram(entry.molecule._molecule)
        views = ChemistryLib.ChemicalDiagramViews2D(diagram)
        entry._entry.set_chemical_diagram_views(views)
    results_writer = CSDSQLDatabaseLib.CrystalStructureDatabaseSearchVectorResultsWriter()
    csd = EntryReader('csd')
    if hasattr(csd, '_component_dbs'):
        # Materialise the values first: on Python 3 a dict view cannot be
        # indexed, so ``values()[-1]`` raised TypeError.
        db = list(csd._component_dbs.values())[-1]
    else:
        db = csd._db
    _substructure_searcher = db.searcher_factory().substructure_searcher()
    _substructure_searcher = CSDSQLDatabaseLib.CSDSQLSubstructureSearcher(_substructure_searcher)
    if CSDSQLDatabaseLib.CSDSQLSubstructureSearcher_valid(_substructure_searcher):
        _substructure_searcher.search_entry(
            entry._entry, self._motif, self.settings._settings, results_writer
        )
        return SubstructureSearch.SubstructureHitList(
            SubstructureSearch.SubstructureHit._from_match(x, self, _entry=entry)
            for x in results_writer.matches()
        )
    else:
        return self._search_crystal(entry.crystal, _database=_database)


def _search_crystal(self, crystal, _database=None):
    '''Private: run the substructure search against a single crystal.

    :param crystal: the crystal to search
    :param _database: optional database recorded on each hit
    :returns: a list of hits; empty when the crystal fails the settings constraints
    :raises: TypeError if no substructure has been added
    :raises: RuntimeError if the underlying motif search fails
    '''
    if self._motif.nsubstructures() == 0:
        raise TypeError('No substructures to search')
    try:
        if not CSDSQLDatabaseLib.test_molecule_settings_constraints(self.settings._settings, crystal.molecule._molecule):
            return []
    except TypeError:
        # The constraints could not be tested; reject the crystal only when
        # 3D coordinates are required by the settings.
        if self.settings.has_3d_coordinates:
            return []
    self._add_enantiomer_consistency()
    view = ChemistryLib.CrystalStructureView.instantiate(crystal._crystal)
    searcher = MotifSearchLib.MotifSearch()
    searcher.set_limit(self.settings.max_hits_per_structure)
    try:
        res = searcher.search(view, self._motif)
    except RuntimeError as e:
        if 'Too many steps' in str(e):
            raise RuntimeError(
                'The crystal search failed: probably due to an over-complex substructure or target molecule.\n'
                'Try restricting the number of hits with max_hits_per_structure or reducing the complexity '
                'of the substructure.'
            )
        else:
            raise RuntimeError('Crystal search failed with: %s' % e)
    if _database is not None:
        _crystal = None
    else:
        _crystal = crystal
    hits = SubstructureSearch.SubstructureHitList(
        SubstructureSearch.SubstructureHit(
            crystal.identifier, m, searcher.search_structure(), self,
            _crystal=_crystal, _database=_database)
        for m in res)
    # Every hit refers back to the searched crystal, whichever way it was built.
    for h in hits:
        h._crystal = crystal
    return hits


def _search_molecule(self, molecule, _database=None):
    '''Private: run the substructure search against a single molecule.

    The molecule is wrapped in a crystal with a default cell and searched
    through :meth:`_search_crystal`; the hits are then pointed back at the
    molecule.

    :param molecule: the molecule to search
    :param _database: optional database recorded on each hit
    :returns: a list of hits; empty when the molecule fails the settings constraints
    '''
    if not CSDSQLDatabaseLib.test_molecule_settings_constraints(self.settings._settings, molecule._molecule):
        return []
    # Remember any existing cell so it can be put back after the search.
    if hasattr(molecule, '_cell'):
        _cell = molecule._cell
    else:
        _cell = None
    self._add_enantiomer_consistency()
    molecule._cell = ChemistryLib.Cell()
    c = Entry.from_molecule(molecule).crystal
    c._crystal.set_cell(ChemistryLib.Cell(), ChemistryLib.CrystalStructure.KEEP_ORTHOGONAL_COORDINATES)
    ret = self._search_crystal(c, _database=_database)
    if _cell is not None:
        molecule._cell = _cell
    for h in ret:
        h._molecule = molecule
        h._crystal = None
    return ret
########################################################################### # Reduced cell search ###########################################################################
class ReducedCellSearch(Search):
    '''Provide reduced cell searches.'''

    @utilities.nested_class('ReducedCellSearch')
    class Settings(Search.Settings):
        '''Settings appropriate to a reduced cell search.'''

        def __init__(self, _settings=None):
            '''Initialise settings.

            :param _settings: an existing internal settings object to wrap;
                when omitted a fresh one is created with
                ``max_hits_per_structure`` set to 1.
            '''
            if _settings is None:
                self._settings = CSDSQLDatabaseLib.CrystalStructureDatabaseReducedCellSearchSettings()
                Search.Settings.__init__(self, self._settings)
                self.max_hits_per_structure = 1
            else:
                self._settings = _settings

        def reset(self):
            '''Reset to default values.'''
            self._settings = CSDSQLDatabaseLib.CrystalStructureDatabaseReducedCellSearchSettings()
            Search.Settings.__init__(self, self._settings)
            self.max_hits_per_structure = 1

        @property
        def percent_length_tolerance(self):
            '''The cell length tolerance as a percentage of the longest cell dimension.'''
            return self._settings.percent_length_tolerance()

        @percent_length_tolerance.setter
        def percent_length_tolerance(self, val):
            '''Set the percent length tolerance.'''
            self._settings.set_percent_length_tolerance(val)

        @property
        def absolute_angle_tolerance(self):
            '''The absolute angle tolerance.'''
            return self._settings.absolute_angle_tolerance()

        @absolute_angle_tolerance.setter
        def absolute_angle_tolerance(self, val):
            '''Set the absolute angle tolerance.'''
            self._settings.set_absolute_angle_tolerance(val)

        @property
        def is_normalised(self):
            '''Whether the input cell is normalised.'''
            return self._settings.is_normalised()

        @is_normalised.setter
        def is_normalised(self, val):
            '''Set the is_normalised property.'''
            self._settings.set_is_normalised(val)

    @utilities.nested_class('ReducedCellSearch')
    class Query(object):
        '''Base query.'''

        def __init__(self, lengths=None, angles=None, lattice_centring=None):
            '''Initialise with cell lengths, cell angles and the lattice centring.

            :param lengths: the three cell lengths
            :param angles: the three cell angles, in degrees
            :param lattice_centring: the centring, either as a string or as
                a value understood by ``ChemistryLib.Spacegroup.centring_text()``
            '''
            self.lengths = lengths
            self.angles = angles
            self.lattice_centring = lattice_centring

        def _get_query(self, settings=None):
            '''Private: return an internal query object.

            :param settings: a :class:`ReducedCellSearch.Settings`; defaults
                are used when omitted
            '''
            if settings is None:
                settings = ReducedCellSearch.Settings()
            if isinstance(self.lattice_centring, str):
                centring = self.lattice_centring
            else:
                etm = ChemistryLib.Spacegroup.centring_text()
                centring = etm.text(self.lattice_centring)
            # Only the first letter of the centring text selects the spacegroup.
            k = centring[0].upper()
            if k == 'R':
                sp = ChemistryLib.Spacegroup('R3', ChemistryLib.Spacegroup.UNKNOWN_SYSTEM)
            else:
                sp = ChemistryLib.Spacegroup(k + '1')
            cell = ChemistryLib.Cell(
                self.lengths[0], self.lengths[1], self.lengths[2],
                MathsLib.Angle(self.angles[0], MathsLib.Angle.DEGREES),
                MathsLib.Angle(self.angles[1], MathsLib.Angle.DEGREES),
                MathsLib.Angle(self.angles[2], MathsLib.Angle.DEGREES),
                sp
            )
            return CSDSQLDatabaseLib.CrystalStructureDatabaseReducedCellSearch(
                cell, settings._settings
            )

    @utilities.nested_class('ReducedCellSearch')
    class CrystalQuery(Query):
        '''Reduced cell query from a crystal.'''

        def __init__(self, crystal):
            '''Initialise from the cell lengths, angles and centring of a crystal.'''
            super(ReducedCellSearch.CrystalQuery, self).__init__(
                crystal.cell_lengths, crystal.cell_angles, crystal.lattice_centring
            )

    @utilities.nested_class('ReducedCellSearch')
    class XMLQuery(Query):
        '''Reduced cell query from an XML representation.'''

        def __init__(self, xml):
            '''Initialise from xml.

            :param xml: XML string
            '''
            parser = CSDSQLDatabaseLib.ReducedCellSearchXMLParser()
            self._query = parser.parse(xml)

        def _get_query(self, settings=None):
            '''Private: return the underlying internal query.

            The ``settings`` argument is ignored; the parsed query is returned as is.
            '''
            return self._query

    @utilities.nested_class('ReducedCellSearch')
    class XMLFileQuery(XMLQuery):
        '''Reduced cell query from a file name.'''

        def __init__(self, file_name):
            '''Initialise from a file name.'''
            with open(file_name) as f:
                super(ReducedCellSearch.XMLFileQuery, self).__init__(f.read())

    def __init__(self, query=None, settings=None):
        '''Initialise with optional query and settings.'''
        self.query = query
        if settings is None:
            settings = self.Settings()
        self.settings = settings

    @staticmethod
    def from_xml(xml):
        '''Construct a reduced cell search from an XML representation.

        The length and angle tolerances found in the XML are copied onto the
        settings of the new search.

        :param xml: XML string
        '''
        stream = UtilitiesLib.istringstream(xml)
        parser = CSDSQLDatabaseLib.ReducedCellSearchXMLParser()
        q = parser.parse(stream)
        rcs = ReducedCellSearch(query=ReducedCellSearch.XMLQuery(xml))
        rcs.settings.percent_length_tolerance = q.settings().percent_length_tolerance()
        rcs.settings.absolute_angle_tolerance = q.settings().absolute_angle_tolerance()
        return rcs

    @staticmethod
    def from_xml_file(file_name):
        '''Construct a reduced cell search from an XML file.

        :param file_name: path to XML file
        :raises: IOError when the file does not exist
        '''
        if not os.path.exists(file_name):
            raise IOError('The file %s does not exist' % file_name)
        with open(file_name) as f:
            return ReducedCellSearch.from_xml(f.read())

    def read_xml(self, xml):
        '''Read XML into this ReducedCellSearch.

        :param xml: XML string
        '''
        self.set_query(ReducedCellSearch.XMLQuery(xml))
        parser = CSDSQLDatabaseLib.ReducedCellSearchXMLParser()
        q = parser.parse(xml)
        self.settings.percent_length_tolerance = q.settings().percent_length_tolerance()
        self.settings.absolute_angle_tolerance = q.settings().absolute_angle_tolerance()

    def read_xml_file(self, file_name):
        '''Read an XML file into this ReducedCellSearch.

        :param file_name: path to XML file
        :raises: IOError if the file cannot be read
        '''
        if not os.path.exists(file_name):
            raise IOError('The file %s does not exist' % file_name)
        with open(file_name) as f:
            self.read_xml(f.read())

    def set_query(self, query):
        '''Set the query.'''
        self.query = query
        self._search = self.query._get_query(self.settings)

    def compare_cells(self, r0, r1):
        '''Compare two reduced cells.

        The length tolerance is the percentage tolerance of the settings
        applied to the longest edge of ``r0``; the angle tolerance is the
        absolute angle tolerance of the settings.

        :param r0: the first reduced cell, an instance of :class:`ccdc.crystal.Crystal.ReducedCell`
        :param r1: the second reduced cell similarly
        :returns: boolean
        '''
        if isinstance(r0, Crystal.ReducedCell):
            r0 = r0._reduced_cell
        if isinstance(r1, Crystal.ReducedCell):
            r1 = r1._reduced_cell
        len_tol = (self.settings.percent_length_tolerance/100.) * max(r0.a(), r0.b(), r0.c())
        ang_tol = self.settings.absolute_angle_tolerance

        def _compare_values(v0, v1, tol):
            '''Private: test value difference lies within tolerance.'''
            return abs(v0 - v1) <= tol

        return (
            _compare_values(r0.a(), r1.a(), len_tol) and
            _compare_values(r0.b(), r1.b(), len_tol) and
            _compare_values(r0.c(), r1.c(), len_tol) and
            _compare_values(r0.alpha().degrees(), r1.alpha().degrees(), ang_tol) and
            _compare_values(r0.beta().degrees(), r1.beta().degrees(), ang_tol) and
            _compare_values(r0.gamma().degrees(), r1.gamma().degrees(), ang_tol)
        )

    def _search_reader(self, database):
        '''Search a database.

        :param database: the reader to search
        :returns: a list of :class:`Search.SearchHit`
        :raises: TypeError if the search has no query
        '''
        if not self.query:
            raise TypeError('The search has no query.')
        if not hasattr(database, '_reduced_cell_searcher'):
            try:
                database._reduced_cell_searcher = database._db.searcher_factory().reduced_cell_searcher()
            except (RuntimeError, NameError, AttributeError):
                # Best effort: fall back to the crystal-by-crystal scan below.
                pass
        if hasattr(database, '_reduced_cell_searcher'):
            if self.settings._has_filter_set():
                # Filters are applied here in Python, so lift the limit while
                # querying and enforce it on the filtered hits instead.
                max_hits = self.settings.max_hit_structures
                if max_hits != maxint32:
                    self.settings.max_hit_structures = maxint32
                try:
                    hits = database._reduced_cell_searcher.search(self.query._get_query(self.settings))
                    ret = []
                    for h in hits:
                        r = Search.SearchHit(h, _database=database)
                        if self.settings.test(r.entry):
                            ret.append(r)
                            if max_hits and len(ret) >= max_hits:
                                break
                finally:
                    # Always restore the caller's limit, even if the search raised.
                    self.settings.max_hit_structures = max_hits
            else:
                hits = database._reduced_cell_searcher.search(self.query._get_query(self.settings))
                ret = [Search.SearchHit(h, _database=database) for h in hits]
        else:
            # Have to do it one-by-one.  _search_crystal applies the settings
            # filter itself, so it is not tested separately here.
            ret = []
            for c in database.crystals():
                ret.extend(self._search_crystal(c))
                # Stop as soon as the limit is reached (was '>', which let one
                # extra hit through).
                if self.settings.max_hit_structures and len(ret) >= self.settings.max_hit_structures:
                    break
        for r in ret:
            r._crystal = None
            r._database = database
        return ret

    def _search_molecule(self, mol):
        '''Molecules don't have cells, so always fails.'''
        return []

    def _search_crystal(self, crystal):
        '''Test the query against a single crystal.

        :returns: a one-element list holding the hit, or an empty list when
            the cells do not compare or the crystal fails the settings filter
        '''
        red = ChemistryLib.ReducedCell(crystal._crystal.cell())
        que = self.query._get_query(self.settings)
        que_red = que.query_cell()
        if self.compare_cells(que_red, red):
            ret = [Search.SearchHit(crystal.identifier)]
            ret[0]._crystal = crystal
            if self.settings._has_filter_set():
                if self.settings.test(crystal):
                    return ret
                else:
                    return []
            return ret
        return []
########################################################################### # Combined search ###########################################################################
class CombinedSearch(Search):
    '''Boolean combinations of other searches.

    TextNumericSearch, SubstructureSearch, SimilaritySearch and ReducedCellSearch can be combined
    using and, or and not to provide a combined search.

    >>> csd = io.EntryReader('csd')
    >>> tns = TextNumericSearch()
    >>> tns.add_compound_name('Aspirin')
    >>> sub_search = SubstructureSearch()
    >>> _ = sub_search.add_substructure(SMARTSSubstructure('C(=O)OH'))
    >>> rcs = ReducedCellSearch(ReducedCellSearch.CrystalQuery(csd.crystal('ACSALA')))
    >>> combi_search = CombinedSearch(tns & (-rcs | -sub_search))
    >>> hits = combi_search.search()
    >>> print(len(hits))
    89
    '''

    class Settings(Search.Settings):
        '''Settings appropriate to a combined search.'''

        def __init__(self):
            # Name the class explicitly: super(self.__class__, self) recurses
            # for ever as soon as this class is subclassed.
            super(CombinedSearch.Settings, self).__init__()

    class CombinedHit(Search.SearchHit):
        '''A hit from a combined search.'''

        def __init__(self, identifier, _database=None, _entry=None, _crystal=None, _molecule=None):
            '''Initialise an empty combined hit for ``identifier``.'''
            # Explicit class for the same reason as in Settings.__init__.
            super(CombinedSearch.CombinedHit, self).__init__(
                identifier, _database=_database, _entry=_entry, _crystal=_crystal, _molecule=_molecule
            )
            self.measurements = dict()
            self.constraints = dict()
            self.geometric_objects = dict()
            self.similarities = dict()
            self._subhits = list()

        @staticmethod
        def _from_similarity_hit(identifier, comparators, similarities, _database=None):
            '''Make a CombinedHit from a SimilarityHit.'''
            hit = CombinedSearch.CombinedHit(identifier, _database=_database)
            hit.similarities.update({
                comp: float(sim) for comp, sim in zip(comparators, similarities)
            })
            return hit

        @staticmethod
        def _from_search_hit(search_hit):
            '''Make a CombinedHit from a TextNumericSearch or a ReducedCellSearch.'''
            return CombinedSearch.CombinedHit(search_hit.identifier, search_hit._database)

        @staticmethod
        def _from_substructure_hit(sub_hit, _database):
            '''Make a CombinedHit from a SubstructureSearchHit.

            The substructure hit is re-pointed at ``_database`` (its entry,
            crystal and molecule references are cleared) and kept as a sub-hit.
            '''
            hit = CombinedSearch.CombinedHit(sub_hit.identifier, _database)
            hit.measurements.update(sub_hit.measurements)
            hit.constraints.update(sub_hit.constraints)
            hit.geometric_objects.update(sub_hit.geometric_objects)
            sub_hit._database = _database
            sub_hit._entry = sub_hit._crystal = sub_hit._molecule = None
            hit._subhits.append(sub_hit)
            return hit

        def _merge(self, hit):
            '''Merge another hit into here.'''
            self.measurements.update(hit.measurements)
            self.constraints.update(hit.constraints)
            self.geometric_objects.update(hit.geometric_objects)
            self.similarities.update(hit.similarities)
            self._subhits.extend(hit._subhits)

        def copy(self):
            '''Return a copy of this hit with its own dictionaries and sub-hit list.'''
            hit = CombinedSearch.CombinedHit(self.identifier, _database=self._database)
            hit.measurements.update(self.measurements)
            hit.constraints.update(self.constraints)
            hit.geometric_objects.update(self.geometric_objects)
            hit.similarities.update(self.similarities)
            hit._subhits = self._subhits[:]
            return hit

        def measurement_atoms(self, name):
            '''The atoms of the named measurement, taken from the last sub-hit defining it.

            Returns None when no sub-hit has a measurement of that name.
            '''
            hs = [h for h in self._subhits if name in h.measurements]
            if hs:
                # NOTE(review): this subscripts the sub-hit's measurement_atoms;
                # if that is a method on the substructure hit it should be
                # called instead - confirm against SubstructureHit.
                return hs[-1].measurement_atoms[name]

        def constraint_atoms(self, name):
            '''The atoms of the named constraint, taken from the last sub-hit defining it.

            Returns None when no sub-hit has a constraint of that name.
            '''
            hs = [h for h in self._subhits if name in h.constraints]
            if hs:
                # NOTE(review): same subscript-versus-call question as in
                # measurement_atoms - confirm against SubstructureHit.
                return hs[-1].constraint_atoms[name]

        def _geometric_object_atoms(self, name):
            '''Private: the named geometric object from the last sub-hit defining it.'''
            hs = [h for h in self._subhits if name in h.geometric_objects]
            if hs:
                return hs[-1].geometric_objects[name]

        def centroid_atoms(self, name):
            '''Look up the named centroid among the sub-hits' geometric objects.'''
            return self._geometric_object_atoms(name)

        def dummy_point_atoms(self, name):
            '''Look up the named dummy point among the sub-hits' geometric objects.'''
            return self._geometric_object_atoms(name)

        def group_atoms(self, name):
            '''Look up the named group among the sub-hits' geometric objects.'''
            return self._geometric_object_atoms(name)

        def vector_atoms(self, name):
            '''Look up the named vector among the sub-hits' geometric objects.'''
            return self._geometric_object_atoms(name)

        def plane_atoms(self, name):
            '''Look up the named plane among the sub-hits' geometric objects.'''
            return self._geometric_object_atoms(name)

        def match_components(self):
            '''The matched components of every sub-hit, concatenated.'''
            return [m for h in self._subhits for m in h.match_components()]

        def match_atoms(self, indices=False):
            '''The matched atoms of every sub-hit, concatenated.'''
            return [a for h in self._subhits for a in h.match_atoms(indices=indices)]

        def match_substructures(self):
            '''The matched substructures of every sub-hit, concatenated.'''
            # Previously delegated to match_components() (copy-paste slip).
            return [m for h in self._subhits for m in h.match_substructures()]

        def match_symmetry_operators(self):
            '''The symmetry operators of every sub-hit, concatenated.'''
            return [m for h in self._subhits for m in h.match_symmetry_operators()]

    def __init__(self, expression, settings=None):
        '''Initialise from a search or a combination of searches.

        :param expression: a search instance or a node built by combining searches
        :param settings: optional :class:`CombinedSearch.Settings`
        :raises: TypeError if the expression cannot take part in a combined search
        '''
        if settings is None:
            settings = CombinedSearch.Settings()
        self.settings = settings
        self._node = self._make_node(expression)
        self._searcher_dict = self._node._searcher_dict
        self._limit_dict = self._node._limit_dict

    @staticmethod
    def _make_node(other):
        '''Private: create a combined search node.

        The returned node carries a ``_searcher_dict`` (str(search) -> search)
        and a ``_limit_dict`` used when turning matches into hits.
        '''
        if isinstance(other, TextNumericSearch):
            _node = CSDSQLDatabaseLib.TextNumericSearchNode(other._search)
            _node._searcher_dict = collections.OrderedDict([(str(other), other)])
            _node._limit_dict = {}
        elif isinstance(other, SubstructureSearch):
            _node = CSDSQLDatabaseLib.MotifNode(
                CSDSQLDatabaseLib.pair_motif_settings(other._motif, other.settings._settings)
            )
            # Tag matches with the originating search so hits can be rebuilt.
            adder = CSDSQLDatabaseLib.MatchStringDataItemAdder('substructure_search', str(other))
            _node = CSDSQLDatabaseLib.MatchMutatorNode(adder, _node)
            _node._searcher_dict = collections.OrderedDict([(str(other), other)])
            _node._limit_dict = {}
        elif isinstance(other, ReducedCellSearch):
            _node = CSDSQLDatabaseLib.ReducedCellNode(other.query._get_query(other.settings))
            _node._searcher_dict = collections.OrderedDict([(str(other), other)])
            _node._limit_dict = {}
        elif isinstance(other, SimilaritySearch):
            _node = CSDSQLDatabaseLib.SimilaritySearchNode(
                CSDSQLDatabaseLib.pair_substructure_simsettings(other._substructure, other.settings._settings)
            )
            adder = CSDSQLDatabaseLib.MatchStringDataItemAdder('similarity_search', str(other))
            _node = CSDSQLDatabaseLib.MatchMutatorNode(adder, _node)
            _node._searcher_dict = collections.OrderedDict([(str(other), other)])
            _node._limit_dict = {}
        #elif isinstance(other, FormulaSearch):
        #    _node = CSDSQLDatabaseLib.FormulaSearchNode(other._search)
        elif isinstance(other, CSDSQLDatabaseLib.Node):
            _node = other
            _node._searcher_dict = other._searcher_dict.copy()
            _node._limit_dict = other._limit_dict.copy()
        else:
            # Interpolate the type (the original passed it as a second
            # exception argument, leaving '%s' in the message).
            raise TypeError('Not appropriate for a combined search %s' % type(other))
        return _node

    def _combine(self, other, node_class):
        '''Private: fold ``other`` into this search using ``node_class`` (AndNode or OrNode).'''
        _node = self._make_node(other)
        if self._node is None:
            self._node = _node
        else:
            self._node = node_class(self._node, _node)
        self._searcher_dict.update(_node._searcher_dict)
        self._limit_dict.update(_node._limit_dict)
        return self

    def __and__(self, other):
        '''Conjoin this with another search.'''
        return self._combine(other, CSDSQLDatabaseLib.AndNode)

    def __iand__(self, other):
        '''In-place conjunction.'''
        # Must return self: an in-place operator returning None rebinds the
        # left-hand name to None.
        return self._combine(other, CSDSQLDatabaseLib.AndNode)

    def __or__(self, other):
        '''Disjoin this with another search.'''
        # NOTE(review): _make_hits expects both sides of an OrNode to be
        # wrapped in 'disjunct' mutators, as _monkey_patch's disjoin does;
        # this plain OrNode may not be convertible to hits - confirm.
        return self._combine(other, CSDSQLDatabaseLib.OrNode)

    def __ior__(self, other):
        '''In-place disjunction.'''
        return self._combine(other, CSDSQLDatabaseLib.OrNode)

    def __neg__(self):
        '''Negate this search.'''
        if self._node is None:
            raise TypeError('No searches to negate')
        _node = CSDSQLDatabaseLib.NotNode(self._node)
        # Use this search's own bookkeeping: after __and__/__or__ the current
        # node is a fresh And/Or node that carries no dictionaries.
        _node._searcher_dict = self._searcher_dict
        _node._limit_dict = self._limit_dict
        self._node = _node
        return self

    @staticmethod
    def _monkey_patch(extra=None):
        '''Private: ensure relevant classes have combination methods.

        :param extra: optional iterable of further classes to patch
        '''
        def negate(s):
            _node = CombinedSearch._make_node(s)
            ret = CSDSQLDatabaseLib.NotNode(_node)
            ret._searcher_dict = _node._searcher_dict
            ret._limit_dict = _node._limit_dict
            return ret

        def conjoin(s, t):
            _s = CombinedSearch._make_node(s)
            _t = CombinedSearch._make_node(t)
            ret = CSDSQLDatabaseLib.AndNode(_s, _t)
            ret._searcher_dict = _s._searcher_dict.copy()
            ret._searcher_dict.update(_t._searcher_dict)
            # Copy, as for _searcher_dict, so the operand's limits are not mutated.
            ret._limit_dict = _s._limit_dict.copy()
            ret._limit_dict.update(_t._limit_dict)
            return ret

        def disjoin(s, t):
            # Each side is tagged 'first'/'second' so _make_hits can tell
            # which disjunct produced a match.
            _s = CombinedSearch._make_node(s)
            first_mutator = CSDSQLDatabaseLib.MatchStringDataItemAdder('disjunct', 'first')
            _sm = CSDSQLDatabaseLib.MatchMutatorNode(first_mutator, _s)
            _sm._searcher_dict = _s._searcher_dict
            _sm._limit_dict = _s._limit_dict
            _t = CombinedSearch._make_node(t)
            second_mutator = CSDSQLDatabaseLib.MatchStringDataItemAdder('disjunct', 'second')
            _tm = CSDSQLDatabaseLib.MatchMutatorNode(second_mutator, _t)
            _tm._searcher_dict = _t._searcher_dict
            _tm._limit_dict = _t._limit_dict
            ret = CSDSQLDatabaseLib.OrNode(_sm, _tm)
            ret._searcher_dict = _s._searcher_dict.copy()
            ret._searcher_dict.update(_t._searcher_dict)
            ret._limit_dict = _s._limit_dict.copy()
            ret._limit_dict.update(_t._limit_dict)
            return ret

        if extra is None:
            extra = []
        for cl in (
            TextNumericSearch, SimilaritySearch, SubstructureSearch, ReducedCellSearch,
            CSDSQLDatabaseLib.AndNode, CSDSQLDatabaseLib.OrNode, CSDSQLDatabaseLib.NotNode,
            CSDSQLDatabaseLib.EntryLimitNode, CSDSQLDatabaseLib.MatchMutatorNode,
        ) + tuple(extra):
            cl.__neg__ = negate
            cl.__and__ = conjoin
            cl.__or__ = disjoin

    @staticmethod
    def _which_node(node):
        '''Private: down-cast a generic node to its concrete node type.

        :raises: NotImplementedError if no cast recognises the node
        '''
        possibilities = [
            CSDSQLDatabaseLib.Node_as_NotNode,
            CSDSQLDatabaseLib.Node_as_AndNode,
            CSDSQLDatabaseLib.Node_as_OrNode,
            CSDSQLDatabaseLib.Node_as_MatchMutatorNode,
            CSDSQLDatabaseLib.Node_as_TextNumericSearchNode,
            CSDSQLDatabaseLib.Node_as_MotifNode,
            CSDSQLDatabaseLib.Node_as_ReducedCellNode,
            #CSDSQLDatabaseLib.Node_as_FormulaSearchNode,
            CSDSQLDatabaseLib.Node_as_SimilaritySearchNode,
            CSDSQLDatabaseLib.Node_as_EntryLimitNode,
        ]
        for p in possibilities:
            n = p(node)
            if n is not None:
                return n
        raise NotImplementedError('Unknown Node type %s' % type(node))

    def _show_node(self, n, indent=0):
        '''For debugging.'''
        _n = CombinedSearch._which_node(n)
        pad = ' '*indent
        if isinstance(_n, CSDSQLDatabaseLib.NotNode):
            s = '%sNot(\n%s\n%s)' % (pad, self._show_node(_n.child(), indent+2), pad)
        elif isinstance(_n, CSDSQLDatabaseLib.OrNode):
            s = '%sOr(\n%s,\n%s\n%s)' % (
                pad, self._show_node(_n.left(), indent+2), self._show_node(_n.right(), indent+2), pad)
        elif isinstance(_n, CSDSQLDatabaseLib.AndNode):
            s = '%sAnd(\n%s,\n%s\n%s)' % (
                pad, self._show_node(_n.left(), indent+2), self._show_node(_n.right(), indent+2), pad)
        elif isinstance(_n, CSDSQLDatabaseLib.MatchMutatorNode):
            mmm = CSDSQLDatabaseLib.Mutator_as_MatchStringDataItemAdder(_n.mutator())
            if mmm.key() == 'similarity_search':
                ident = self._searcher_dict[mmm.value()].molecule.identifier
            else:
                ident = ''
            s = '%sMutate(%s=%s(%s)\n%s\n%s)' % (
                pad, mmm.key(), mmm.value(), ident, self._show_node(_n.child(), indent+2), pad)
        elif isinstance(_n, CSDSQLDatabaseLib.EntryLimitNode):
            s = '%sLimit(\n%s\n%s)' % (pad, self._show_node(_n.child(), indent+2), pad)
        elif isinstance(_n, CSDSQLDatabaseLib.TextNumericSearchNode):
            s = '%sText()' % (pad)
        elif isinstance(_n, CSDSQLDatabaseLib.SimilaritySearchNode):
            s = '%sSimilarity()' % (pad)
        elif isinstance(_n, CSDSQLDatabaseLib.MotifNode):
            s = '%sMotif()' % (pad)
        elif isinstance(_n, CSDSQLDatabaseLib.ReducedCellNode):
            s = '%sReduced()' % (pad)
        else:
            raise NotImplementedError('WTF? %s' % type(_n))
        return s

    def _make_hits(self, match, node, _database=None, pars=None):
        '''The hits from an individual match.

        Walks the search expression rooted at ``node`` and builds the
        :class:`CombinedSearch.CombinedHit` instances for ``match``.

        :param match: the match reported by the combination searcher
        :param node: the node of the search expression being interpreted
        :param _database: the database the match came from
        :param pars: the match's parameter dictionary; fetched from the match when omitted
        '''
        identifier = match.identifier().str()
        if pars is None:
            pars = match.data().parameters()
        n = self._which_node(node)
        if isinstance(n, CSDSQLDatabaseLib.NotNode):
            return [CombinedSearch.CombinedHit(identifier, _database=_database)]
        elif isinstance(n, CSDSQLDatabaseLib.OrNode):
            # Need to work out which disjunct, probably from a MatchMutator
            # Get the disjunct parameter
            if 'disjunct' not in pars:
                raise RuntimeError('No disjunct in pars')
            which = pars['disjunct'].pop()
            branch = n.left() if which == 'first' else n.right()
            # Step through the 'disjunct' mutator wrapping the chosen side.
            branch = self._which_node(branch).child()
            return self._make_hits(match, branch, _database=_database, pars=pars)
        elif isinstance(n, CSDSQLDatabaseLib.AndNode):
            # Cartesian product.
            left_hits = self._make_hits(match, n.left(), _database=_database, pars=pars)
            right_hits = self._make_hits(match, n.right(), _database=_database, pars=pars)
            result = []
            for l in left_hits:
                for r in right_hits:
                    h = l.copy()
                    h._merge(r)
                    result.append(h)
            return result
        elif isinstance(n, CSDSQLDatabaseLib.TextNumericSearchNode):
            return [CombinedSearch.CombinedHit(identifier, _database=_database)]
        elif isinstance(n, CSDSQLDatabaseLib.ReducedCellNode):
            return [CombinedSearch.CombinedHit(identifier, _database=_database)]
        elif isinstance(n, CSDSQLDatabaseLib.SimilaritySearchNode):
            # This won't happen - guarded by the MatchMutatorNode
            raise RuntimeError('SimilaritySearchNode: this cannot happen')
        elif isinstance(n, CSDSQLDatabaseLib.MatchMutatorNode):
            mmm = CSDSQLDatabaseLib.Mutator_as_MatchStringDataItemAdder(n.mutator())
            if 'substructure_search' in pars:
                if mmm.key() == 'substructure_search':
                    # Re-run the substructure search on the crystal to get
                    # fully populated substructure hits.
                    searcher = self._searcher_dict[mmm.value()]
                    cry = _database.crystal(identifier)
                    hits = [CombinedSearch.CombinedHit._from_substructure_hit(h, _database)
                            for h in searcher._search_crystal(cry)]
                    return hits
                else:
                    return self._make_hits(match, n.child(), _database=_database, pars=pars)
            elif 'similarity_search' in pars:
                if mmm.key() == 'similarity_search':
                    assert len(pars['similarity']) == len(pars['similarity_search'])
                    searchers = [self._searcher_dict[x] for x in pars['similarity_search']]
                    ids = [s.molecule.identifier for s in searchers]
                    vals = [float(x) for x in pars['similarity']]
                    try:
                        return [CombinedSearch.CombinedHit._from_similarity_hit(
                            identifier, ids, vals, _database=_database)]
                    except:
                        # Debugging aid only; the exception is re-raised.
                        print(self._show_node(n), pars)
                        raise
                else:
                    return self._make_hits(match, n.child(), _database=_database, pars=pars)
            elif 'entry_limit' in pars:
                return self._make_hits(match, n.child(), _database=_database, pars=pars)
            else:
                raise RuntimeError('Unexpected item in the bagging area\n%s' % ('\n'.join('%s: %s' % (k, v) for k, v in pars.items())))
        elif isinstance(n, CSDSQLDatabaseLib.MotifNode):
            raise RuntimeError('MotifNode: this cannot happen')
        elif isinstance(n, CSDSQLDatabaseLib.EntryLimitNode):
            # Tested before FormulaSearchNode: that lookup would fail with
            # AttributeError if the library does not expose the class
            # (presumably the case, since its cast is commented out - confirm).
            return self._make_hits(match, n.child(), _database=_database, pars=pars)
        elif isinstance(n, CSDSQLDatabaseLib.FormulaSearchNode):
            # Not implemented in the API yet
            raise NotImplementedError('No FormulaSearch in the API')
        else:
            raise NotImplementedError('No implementation for %s' % type(n))

    @staticmethod
    def max_hit_structures(other, count):
        '''Limit the number of hits found by a combination search.

        :param other: a combination of searches.
        :param count: maximum number of hits to find.
        '''
        _other = CombinedSearch._make_node(other)
        _node = CSDSQLDatabaseLib.EntryLimitNode(_other, count)
        s = str(_node)
        mutator = CSDSQLDatabaseLib.MatchStringDataItemAdder('entry_limit', s)
        _node = CSDSQLDatabaseLib.MatchMutatorNode(mutator, _node)
        # Carry over the wrapped expression's bookkeeping so searches and
        # limits nested inside a compound ``other`` are not lost.  For a
        # single search this is the same {str(other): other} mapping as before.
        _node._searcher_dict = _other._searcher_dict.copy()
        _node._limit_dict = dict(_other._limit_dict)
        _node._limit_dict[s] = count
        return _node

    def _search_reader(self, database=None):
        '''Private: run the combined search over a database reader.

        :param database: the reader to search; the CSD when omitted
        :returns: a list of :class:`CombinedSearch.CombinedHit`
        '''
        if database is None:
            database = io.EntryReader('csd')
        subset_db = FileFormatsLib.CrystalStructureDatabaseAsCrystalStructureDatabaseSubset(database._db)
        if subset_db and isinstance(database._underlying_file_name, list):
            # A subset spanning several database files: assign every
            # identifier to the first file that contains it and search a pool
            # of per-file subsets.
            ids_to_db = collections.defaultdict(list)
            sub_dbs = [io.EntryReader(f) for f in database._underlying_file_name]
            for i in sorted(database.identifier(i) for i in range(len(database))):
                for x, d in enumerate(sub_dbs):
                    if d._db.identifier_exists(UtilitiesLib.DatabaseEntryIdentifier(i)):
                        ids_to_db[x].append(i)
                        break
            new_pool = FileFormatsLib.CrystalStructureDatabasePool()
            for k, v in sorted(ids_to_db.items(), key=operator.itemgetter(1)):
                sub_db = FileFormatsLib.CrystalStructureDatabaseSubset(v, sub_dbs[k]._db)
                s = FileFormatsLib.CrystalStructureDatabaseSubsetAsCrystalStructureDatabase(sub_db)
                new_pool.append(s)
            searcher = new_pool.searcher_factory().combination_searcher()
        else:
            searcher = database._db.searcher_factory().combination_searcher()
        searcher.set_search_definition_node(self._node)

        class ResultsWriter(object):
            '''Callback collecting hits as the searcher reports matches.'''

            def __init__(self, searcher, _database=None):
                self.searcher = searcher
                self._database = _database
                self.hits = []
                self._matches = []
                self.current_id = None
                # Work on a copy: the counters are decremented below and the
                # search's own limits must survive for a later search() call.
                self._limit_dict = dict(searcher._limit_dict)

            @staticmethod
            def _is_suppressed(hit):
                '''True when the hit's matched atoms cannot be retrieved.'''
                try:
                    hit.match_atoms()
                    return False
                except RuntimeError:
                    return True

            def __call__(self, **kw):
                match = kw['match']
                identifier = match.identifier().str()
                # Only the first match of consecutive matches for one
                # identifier is expanded into hits.
                if identifier == self.current_id:
                    return
                pars = match.data().parameters()
                if 'entry_limit' in pars:
                    for p in pars['entry_limit']:
                        self._limit_dict[p] -= 1
                        if self._limit_dict[p] < 0:
                            return
                self.current_id = identifier
                new_hits = self.searcher._make_hits(match, self.searcher._node, _database=self._database)
                if self._database.__class__.__name__ == 'MoleculeReader':
                    new_hits = [h for h in new_hits if not self._is_suppressed(h)]
                self.hits.extend(new_hits)

        rw = ResultsWriter(self, _database=database)
        results_writer = CSDSQLDatabaseLib.PythonResultsWriter(rw)
        searcher.search(results_writer)
        return rw.hits

    def _search_entry(self, entry):
        '''Not supported: combined searches only run against a database.'''
        raise NotImplementedError('Combined searches not implemented for an entry')

    def _search_crystal(self, crystal):
        '''Not supported: combined searches only run against a database.'''
        raise NotImplementedError('Combined searches not implemented for a crystal')

    def _search_molecule(self, molecule):
        '''Not supported: combined searches only run against a database.'''
        raise NotImplementedError('Combined searches not implemented for a molecule')
# Install the __neg__, __and__ and __or__ operators on the individual search
# classes and the node classes at import time, so that searches can be
# combined with -, & and | before being handed to CombinedSearch.
CombinedSearch._monkey_patch()