# Source code for xenonpy.descriptor.fingerprint

#  Copyright (c) 2021. yoshida-lab. All rights reserved.
#  Use of this source code is governed by a BSD-style
#  license that can be found in the LICENSE file.

import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors as ChemDesc
from rdkit.Chem import MACCSkeys as MAC
from rdkit.Chem import rdMolDescriptors as rdMol
from rdkit.Chem import rdmolops as rdm
from rdkit.Chem.rdMHFPFingerprint import MHFPEncoder
from rdkit.ML.Descriptors import MoleculeDescriptors

from scipy.sparse import coo_matrix

from xenonpy.descriptor.base import BaseDescriptor, BaseFeaturizer

__all__ = ['RDKitFP', 'AtomPairFP', 'TopologicalTorsionFP', 'MACCS', 'FCFP', 'ECFP', 'PatternFP', 'LayeredFP',
           'MHFP', 'DescriptorFeature', 'Fingerprints']


def count_fp(fp, dim=2**10):
    """
    Fold a sparse count fingerprint into a dense 1-D array of length ``dim``.

    Parameters
    ----------
    fp
        Object exposing ``GetNonzeroElements()``, which must return a
        mapping from entry index to count.
    dim: int
        Length of the returned vector. Every entry index is reduced
        modulo ``dim``.

    Returns
    -------
    numpy.ndarray
        Flattened array of shape ``(dim,)``. Counts whose indices fall on the
        same position after the modulo are added together.
    """
    nonzero = fp.GetNonzeroElements()
    counts = list(nonzero.values())
    # Column of each count after folding; the single row index is always 0.
    cols = [key % dim for key in nonzero]
    rows = np.zeros(len(nonzero), dtype=int)
    # Converting the COO matrix to dense sums duplicated (row, col) entries.
    folded = coo_matrix((counts, (rows, cols)), shape=(1, dim))
    dense = folded.toarray()
    return dense.flatten()


class RDKitFP(BaseFeaturizer):

    def __init__(self,
                 n_jobs=-1,
                 *,
                 n_bits=2048,
                 bit_per_entry=None,
                 counting=False,
                 input_type='mol',
                 on_errors='raise',
                 return_type='any',
                 target_col=None):
        """
        RDKit fingerprint.

        Parameters
        ----------
        n_jobs: int
            Number of jobs to run in parallel for both fit and predict.
            Give the number of cpus, or -1 (default) to use all cpu cores.
        n_bits: int
            Fingerprint size.
        bit_per_entry: int
            Number of bits used to represent a single entry (only used in the non-counting case).
            When ``None`` (default), 2 is used.
        counting: boolean
            Record counts of the entries instead of bits only.
        input_type: string
            Type of the objects handed to ``transform``.
            ``mol`` (default) expects ``rdkit.Chem.rdchem.Mol`` objects,
            ``smiles`` expects SMILES strings, and ``any`` accepts both.
            SMILES are converted with ``Chem.MolFromSmiles``;
            a ``ValueError`` is raised when the conversion returns ``None``.
        on_errors: string
            How exceptions in feature calculations are handled. One of 'nan', 'keep', 'raise'.
            'nan' returns a column of ``np.nan`` whose length equals the number of feature labels.
            'keep' returns a column holding the exception objects.
            'raise' (default) raises the exception.
        return_type: string
            Passed through to ``BaseFeaturizer`` unchanged.
        target_col
            Only relevant when the input is a pd.DataFrame, otherwise ignored.
            Names the single column to transform; ``None`` (default) uses all columns.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.n_bits = n_bits
        self.bit_per_entry = 2 if bit_per_entry is None else bit_per_entry
        self.counting = counting
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Compute the RDKit fingerprint of one molecule.

        ``x`` is a Mol object or a SMILES string, depending on ``input_type``.
        Returns a list of bits, or an array of folded counts when ``counting`` is set.
        Raises ``ValueError`` when a SMILES cannot be converted.
        """
        parse_smiles = self.input_type == 'smiles' or (
            self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol))
        if parse_smiles:
            mol = Chem.MolFromSmiles(x)
            if mol is None:
                raise ValueError('cannot convert Mol from SMILES %s' % x)
        else:
            mol = x

        if not self.counting:
            return list(Chem.RDKFingerprint(mol, fpSize=self.n_bits, nBitsPerHash=self.bit_per_entry))
        # The count-based fingerprint is unfolded; count_fp folds it down to n_bits.
        return count_fp(rdm.UnfoldedRDKFingerprintCountBased(mol), dim=self.n_bits)

    @property
    def feature_labels(self):
        """Labels ``rdkit:<i>`` (bits) or ``rdkit_c:<i>`` (counts) for ``i`` in ``range(n_bits)``."""
        prefix = "rdkit_c:" if self.counting else "rdkit:"
        return [prefix + str(i) for i in range(self.n_bits)]


class AtomPairFP(BaseFeaturizer):

    def __init__(self,
                 n_jobs=-1,
                 *,
                 n_bits=2048,
                 bit_per_entry=None,
                 counting=False,
                 input_type='mol',
                 on_errors='raise',
                 return_type='any',
                 target_col=None):
        """
        Atom Pair fingerprints.
        Returns the atom-pair fingerprint for a molecule. The algorithm used is described here:
        R.E. Carhart, D.H. Smith, R. Venkataraghavan;
        "Atom Pairs as Molecular Features in Structure-Activity Studies: Definition and Applications"
        JCICS 25, 64-73 (1985).
        The result has a fixed length of ``n_bits`` after folding.

        Parameters
        ----------
        n_jobs: int
            Number of jobs to run in parallel for both fit and predict.
            Give the number of cpus, or -1 (default) to use all cpu cores.
        n_bits: int
            Fixed bit length based on folding.
        bit_per_entry: int
            Number of bits used to represent a single entry (only used in the non-counting case).
            When ``None`` (default), 4 is used.
        counting: boolean
            Record counts of the entries instead of bits only.
        input_type: string
            Type of the objects handed to ``transform``.
            ``mol`` (default) expects ``rdkit.Chem.rdchem.Mol`` objects,
            ``smiles`` expects SMILES strings, and ``any`` accepts both.
            SMILES are converted with ``Chem.MolFromSmiles``;
            a ``ValueError`` is raised when the conversion returns ``None``.
        on_errors: string
            How exceptions in feature calculations are handled. One of 'nan', 'keep', 'raise'.
            'nan' returns a column of ``np.nan`` whose length equals the number of feature labels.
            'keep' returns a column holding the exception objects.
            'raise' (default) raises the exception.
        return_type: string
            Passed through to ``BaseFeaturizer`` unchanged.
        target_col
            Only relevant when the input is a pd.DataFrame, otherwise ignored.
            Names the single column to transform; ``None`` (default) uses all columns.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.n_bits = n_bits
        self.bit_per_entry = 4 if bit_per_entry is None else bit_per_entry
        self.counting = counting
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Compute the hashed atom-pair fingerprint of one molecule.

        ``x`` is a Mol object or a SMILES string, depending on ``input_type``.
        Returns a list of bits, or an array of counts when ``counting`` is set.
        Raises ``ValueError`` when a SMILES cannot be converted.
        """
        parse_smiles = self.input_type == 'smiles' or (
            self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol))
        if parse_smiles:
            mol = Chem.MolFromSmiles(x)
            if mol is None:
                raise ValueError('cannot convert Mol from SMILES %s' % x)
        else:
            mol = x

        if not self.counting:
            bit_vect = rdMol.GetHashedAtomPairFingerprintAsBitVect(
                mol, nBits=self.n_bits, nBitsPerEntry=self.bit_per_entry)
            return list(bit_vect)
        hashed_counts = rdMol.GetHashedAtomPairFingerprint(mol, nBits=self.n_bits)
        return count_fp(hashed_counts, dim=self.n_bits)

    @property
    def feature_labels(self):
        """Labels ``apfp:<i>`` (bits) or ``apfp_c:<i>`` (counts) for ``i`` in ``range(n_bits)``."""
        prefix = 'apfp_c:' if self.counting else 'apfp:'
        return [prefix + str(i) for i in range(self.n_bits)]


class TopologicalTorsionFP(BaseFeaturizer):

    def __init__(self,
                 n_jobs=-1,
                 *,
                 n_bits=2048,
                 bit_per_entry=None,
                 counting=False,
                 input_type='mol',
                 on_errors='raise',
                 return_type='any',
                 target_col=None):
        """
        Topological Torsion fingerprints.
        Returns the topological-torsion fingerprint for a molecule.
        The result has a fixed length of ``n_bits`` after folding.

        Parameters
        ----------
        n_jobs: int
            Number of jobs to run in parallel for both fit and predict.
            Give the number of cpus, or -1 (default) to use all cpu cores.
        n_bits: int
            Fixed bit length based on folding.
        bit_per_entry: int
            Number of bits used to represent a single entry (only used in the non-counting case).
            When ``None`` (default), 4 is used.
        counting: boolean
            Record counts of the entries instead of bits only.
        input_type: string
            Type of the objects handed to ``transform``.
            ``mol`` (default) expects ``rdkit.Chem.rdchem.Mol`` objects,
            ``smiles`` expects SMILES strings, and ``any`` accepts both.
            SMILES are converted with ``Chem.MolFromSmiles``;
            a ``ValueError`` is raised when the conversion returns ``None``.
        on_errors: string
            How exceptions in feature calculations are handled. One of 'nan', 'keep', 'raise'.
            'nan' returns a column of ``np.nan`` whose length equals the number of feature labels.
            'keep' returns a column holding the exception objects.
            'raise' (default) raises the exception.
        return_type: string
            Passed through to ``BaseFeaturizer`` unchanged.
        target_col
            Only relevant when the input is a pd.DataFrame, otherwise ignored.
            Names the single column to transform; ``None`` (default) uses all columns.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.n_bits = n_bits
        self.bit_per_entry = 4 if bit_per_entry is None else bit_per_entry
        self.counting = counting
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Compute the hashed topological-torsion fingerprint of one molecule.

        ``x`` is a Mol object or a SMILES string, depending on ``input_type``.
        Returns a list of bits, or an array of counts when ``counting`` is set.
        Raises ``ValueError`` when a SMILES cannot be converted.
        """
        parse_smiles = self.input_type == 'smiles' or (
            self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol))
        if parse_smiles:
            mol = Chem.MolFromSmiles(x)
            if mol is None:
                raise ValueError('cannot convert Mol from SMILES %s' % x)
        else:
            mol = x

        if not self.counting:
            bit_vect = rdMol.GetHashedTopologicalTorsionFingerprintAsBitVect(
                mol, nBits=self.n_bits, nBitsPerEntry=self.bit_per_entry)
            return list(bit_vect)
        hashed_counts = rdMol.GetHashedTopologicalTorsionFingerprint(mol, nBits=self.n_bits)
        return count_fp(hashed_counts, dim=self.n_bits)

    @property
    def feature_labels(self):
        """Labels ``ttfp:<i>`` (bits) or ``ttfp_c:<i>`` (counts) for ``i`` in ``range(n_bits)``."""
        prefix = 'ttfp_c:' if self.counting else 'ttfp:'
        return [prefix + str(i) for i in range(self.n_bits)]


class MACCS(BaseFeaturizer):

    def __init__(self,
                 n_jobs=-1,
                 *,
                 input_type='mol',
                 on_errors='raise',
                 return_type='any',
                 target_col=None):
        """
        The MACCS keys for a molecule. The result is a 167-bit vector. There are 166 public keys,
        but to maintain consistency with other software packages they are numbered from 1.

        Parameters
        ----------
        n_jobs: int
            Number of jobs to run in parallel for both fit and predict.
            Give the number of cpus, or -1 (default) to use all cpu cores.
        input_type: string
            Type of the objects handed to ``transform``.
            ``mol`` (default) expects ``rdkit.Chem.rdchem.Mol`` objects,
            ``smiles`` expects SMILES strings, and ``any`` accepts both.
            SMILES are converted with ``Chem.MolFromSmiles``;
            a ``ValueError`` is raised when the conversion returns ``None``.
        on_errors: string
            How exceptions in feature calculations are handled. One of 'nan', 'keep', 'raise'.
            'nan' returns a column of ``np.nan`` whose length equals the number of feature labels.
            'keep' returns a column holding the exception objects.
            'raise' (default) raises the exception.
        return_type: string
            Passed through to ``BaseFeaturizer`` unchanged.
        target_col
            Only relevant when the input is a pd.DataFrame, otherwise ignored.
            Names the single column to transform; ``None`` (default) uses all columns.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Compute the MACCS keys of one molecule and return them as a list.

        ``x`` is a Mol object or a SMILES string, depending on ``input_type``.
        Raises ``ValueError`` when a SMILES cannot be converted.
        """
        parse_smiles = self.input_type == 'smiles' or (
            self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol))
        if parse_smiles:
            mol = Chem.MolFromSmiles(x)
            if mol is None:
                raise ValueError('cannot convert Mol from SMILES %s' % x)
        else:
            mol = x

        keys = MAC.GenMACCSKeys(mol)
        return list(keys)

    @property
    def feature_labels(self):
        """Labels ``maccs:0`` through ``maccs:166``."""
        return [f'maccs:{i}' for i in range(167)]


class FCFP(BaseFeaturizer):

    def __init__(self,
                 n_jobs=-1,
                 *,
                 radius=3,
                 n_bits=2048,
                 counting=False,
                 input_type='mol',
                 on_errors='raise',
                 return_type='any',
                 target_col=None):
        """
        Morgan (Circular) fingerprints + feature-based (FCFP)
        The algorithm used is described in the paper Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints.
        JCIM 50:742-54 (2010)

        Parameters
        ----------
        n_jobs: int
            Number of jobs to run in parallel for both fit and predict.
            Give the number of cpus, or -1 (default) to use all cpu cores.
        radius: int
            The radius parameter in the Morgan fingerprints, which is roughly half of the diameter parameter in FCFP,
            i.e., radius=2 is roughly equivalent to FCFP4.
        n_bits: int
            Fixed bit length based on folding.
        counting: boolean
            Record counts of the entries instead of bits only.
        input_type: string
            Type of the objects handed to ``transform``.
            ``mol`` (default) expects ``rdkit.Chem.rdchem.Mol`` objects,
            ``smiles`` expects SMILES strings, and ``any`` accepts both.
            SMILES are converted with ``Chem.MolFromSmiles``;
            a ``ValueError`` is raised when the conversion returns ``None``.
        on_errors: string
            How exceptions in feature calculations are handled. One of 'nan', 'keep', 'raise'.
            'nan' returns a column of ``np.nan`` whose length equals the number of feature labels.
            'keep' returns a column holding the exception objects.
            'raise' (default) raises the exception.
        return_type: string
            Passed through to ``BaseFeaturizer`` unchanged.
        target_col
            Only relevant when the input is a pd.DataFrame, otherwise ignored.
            Names the single column to transform; ``None`` (default) uses all columns.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.radius = radius
        self.n_bits = n_bits
        self.counting = counting
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Compute the feature-based Morgan fingerprint of one molecule.

        ``x`` is a Mol object or a SMILES string, depending on ``input_type``.
        Returns a list of bits, or an array of counts when ``counting`` is set.
        Raises ``ValueError`` when a SMILES cannot be converted.
        """
        parse_smiles = self.input_type == 'smiles' or (
            self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol))
        if parse_smiles:
            mol = Chem.MolFromSmiles(x)
            if mol is None:
                raise ValueError('cannot convert Mol from SMILES %s' % x)
        else:
            mol = x

        if not self.counting:
            bit_vect = rdMol.GetMorganFingerprintAsBitVect(
                mol, radius=self.radius, nBits=self.n_bits, useFeatures=True)
            return list(bit_vect)
        hashed_counts = rdMol.GetHashedMorganFingerprint(
            mol, radius=self.radius, nBits=self.n_bits, useFeatures=True)
        return count_fp(hashed_counts, dim=self.n_bits)

    @property
    def feature_labels(self):
        """Labels ``fcfp<2*radius>:<i>`` (bits) or ``fcfp<2*radius>_c:<i>`` (counts)."""
        suffix = '_c:' if self.counting else ':'
        prefix = f'fcfp{self.radius * 2}{suffix}'
        return [prefix + str(i) for i in range(self.n_bits)]


class ECFP(BaseFeaturizer):

    def __init__(self,
                 n_jobs=-1,
                 *,
                 radius=3,
                 n_bits=2048,
                 counting=False,
                 input_type='mol',
                 on_errors='raise',
                 return_type='any',
                 target_col=None):
        """
        Morgan (Circular) fingerprints (ECFP)
        The algorithm used is described in the paper Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints.
        JCIM 50:742-54 (2010)

        Parameters
        ----------
        n_jobs: int
            Number of jobs to run in parallel for both fit and predict.
            Give the number of cpus, or -1 (default) to use all cpu cores.
        radius: int
            The radius parameter in the Morgan fingerprints, which is roughly half of the diameter parameter in ECFP,
            i.e., radius=2 is roughly equivalent to ECFP4.
        n_bits: int
            Fixed bit length based on folding.
        counting: boolean
            Record counts of the entries instead of bits only.
        input_type: string
            Type of the objects handed to ``transform``.
            ``mol`` (default) expects ``rdkit.Chem.rdchem.Mol`` objects,
            ``smiles`` expects SMILES strings, and ``any`` accepts both.
            SMILES are converted with ``Chem.MolFromSmiles``;
            a ``ValueError`` is raised when the conversion returns ``None``.
        on_errors: string
            How exceptions in feature calculations are handled. One of 'nan', 'keep', 'raise'.
            'nan' returns a column of ``np.nan`` whose length equals the number of feature labels.
            'keep' returns a column holding the exception objects.
            'raise' (default) raises the exception.
        return_type: string
            Passed through to ``BaseFeaturizer`` unchanged.
        target_col
            Only relevant when the input is a pd.DataFrame, otherwise ignored.
            Names the single column to transform; ``None`` (default) uses all columns.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.radius = radius
        self.n_bits = n_bits
        self.counting = counting
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Compute the Morgan fingerprint of one molecule.

        ``x`` is a Mol object or a SMILES string, depending on ``input_type``.
        Returns a list of bits, or an array of counts when ``counting`` is set.
        Raises ``ValueError`` when a SMILES cannot be converted.
        """
        parse_smiles = self.input_type == 'smiles' or (
            self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol))
        if parse_smiles:
            mol = Chem.MolFromSmiles(x)
            if mol is None:
                raise ValueError('cannot convert Mol from SMILES %s' % x)
        else:
            mol = x

        if not self.counting:
            bit_vect = rdMol.GetMorganFingerprintAsBitVect(mol, radius=self.radius, nBits=self.n_bits)
            return list(bit_vect)
        hashed_counts = rdMol.GetHashedMorganFingerprint(mol, radius=self.radius, nBits=self.n_bits)
        return count_fp(hashed_counts, dim=self.n_bits)

    @property
    def feature_labels(self):
        """Labels ``ecfp<2*radius>:<i>`` (bits) or ``ecfp<2*radius>_c:<i>`` (counts)."""
        suffix = '_c:' if self.counting else ':'
        prefix = f'ecfp{self.radius * 2}{suffix}'
        return [prefix + str(i) for i in range(self.n_bits)]


class PatternFP(BaseFeaturizer):

    def __init__(self,
                 n_jobs=-1,
                 *,
                 n_bits=2048,
                 input_type='mol',
                 on_errors='raise',
                 return_type='any',
                 target_col=None):
        """
        A fingerprint designed to be used in substructure screening using SMARTS patterns (unique in RDKit).

        Parameters
        ----------
        n_jobs: int
            Number of jobs to run in parallel for both fit and predict.
            Give the number of cpus, or -1 (default) to use all cpu cores.
        n_bits: int
            Fixed bit length based on folding.
        input_type: string
            Type of the objects handed to ``transform``.
            ``mol`` (default) expects ``rdkit.Chem.rdchem.Mol`` objects,
            ``smiles`` expects SMILES strings, and ``any`` accepts both.
            SMILES are converted with ``Chem.MolFromSmiles``;
            a ``ValueError`` is raised when the conversion returns ``None``.
        on_errors: string
            How exceptions in feature calculations are handled. One of 'nan', 'keep', 'raise'.
            'nan' returns a column of ``np.nan`` whose length equals the number of feature labels.
            'keep' returns a column holding the exception objects.
            'raise' (default) raises the exception.
        return_type: string
            Passed through to ``BaseFeaturizer`` unchanged.
        target_col
            Only relevant when the input is a pd.DataFrame, otherwise ignored.
            Names the single column to transform; ``None`` (default) uses all columns.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.n_bits = n_bits
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Compute the pattern fingerprint of one molecule and return it as a list.

        ``x`` is a Mol object or a SMILES string, depending on ``input_type``.
        Raises ``ValueError`` when a SMILES cannot be converted.
        """
        parse_smiles = self.input_type == 'smiles' or (
            self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol))
        if parse_smiles:
            mol = Chem.MolFromSmiles(x)
            if mol is None:
                raise ValueError('cannot convert Mol from SMILES %s' % x)
        else:
            mol = x

        bit_vect = rdm.PatternFingerprint(mol, fpSize=self.n_bits)
        return list(bit_vect)

    @property
    def feature_labels(self):
        """Labels ``patfp:<i>`` for ``i`` in ``range(n_bits)``."""
        return [f'patfp:{i}' for i in range(self.n_bits)]


class LayeredFP(BaseFeaturizer):

    def __init__(self,
                 n_jobs=-1,
                 *,
                 n_bits=2048,
                 input_type='mol',
                 on_errors='raise',
                 return_type='any',
                 target_col=None):
        """
        A substructure fingerprint that is more complex than PatternFP (unique in RDKit).

        Parameters
        ----------
        n_jobs: int
            Number of jobs to run in parallel for both fit and predict.
            Give the number of cpus, or -1 (default) to use all cpu cores.
        n_bits: int
            Fixed bit length based on folding.
        input_type: string
            Type of the objects handed to ``transform``.
            ``mol`` (default) expects ``rdkit.Chem.rdchem.Mol`` objects,
            ``smiles`` expects SMILES strings, and ``any`` accepts both.
            SMILES are converted with ``Chem.MolFromSmiles``;
            a ``ValueError`` is raised when the conversion returns ``None``.
        on_errors: string
            How exceptions in feature calculations are handled. One of 'nan', 'keep', 'raise'.
            'nan' returns a column of ``np.nan`` whose length equals the number of feature labels.
            'keep' returns a column holding the exception objects.
            'raise' (default) raises the exception.
        return_type: string
            Passed through to ``BaseFeaturizer`` unchanged.
        target_col
            Only relevant when the input is a pd.DataFrame, otherwise ignored.
            Names the single column to transform; ``None`` (default) uses all columns.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.n_bits = n_bits
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Compute the layered fingerprint of one molecule and return it as a list.

        ``x`` is a Mol object or a SMILES string, depending on ``input_type``.
        Raises ``ValueError`` when a SMILES cannot be converted.
        """
        parse_smiles = self.input_type == 'smiles' or (
            self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol))
        if parse_smiles:
            mol = Chem.MolFromSmiles(x)
            if mol is None:
                raise ValueError('cannot convert Mol from SMILES %s' % x)
        else:
            mol = x

        bit_vect = rdm.LayeredFingerprint(mol, fpSize=self.n_bits)
        return list(bit_vect)

    @property
    def feature_labels(self):
        """Labels ``layfp:<i>`` for ``i`` in ``range(n_bits)``."""
        return [f'layfp:{i}' for i in range(self.n_bits)]


class MHFP(BaseFeaturizer):

    def __init__(self,
                 n_jobs=1,
                 *,
                 radius=3,
                 n_bits=2048,
                 input_type='mol',
                 on_errors='raise',
                 return_type='any',
                 target_col=None):
        """
        Variation from the MinHash fingerprint, which is based on ECFP with
        locality sensitive hashing to increase compactness of information during hashing.
        The algorithm used is described in the paper
        Probst, D. & Reymond, J.-L., A probabilistic molecular fingerprint for big data settings.
        Journal of Cheminformatics, 10:66 (2018)

        Note that MHFP currently does not support parallel computing, so please fix n_jobs to 1.

        Parameters
        ----------
        n_jobs: int
            Number of jobs to run in parallel for both fit and predict.
            Defaults to 1 and should stay 1 (see the note above).
        radius: int
            The radius parameter in the SECFP(RDKit version) fingerprints,
            which is roughly half of the diameter parameter in ECFP,
            i.e., radius=2 is roughly equivalent to ECFP4.
        n_bits: int
            Fixed bit length based on folding.
        input_type: string
            Type of the objects handed to ``transform``.
            ``mol`` (default) expects ``rdkit.Chem.rdchem.Mol`` objects,
            ``smiles`` expects SMILES strings, and ``any`` accepts both.
            SMILES are converted with ``Chem.MolFromSmiles``;
            a ``ValueError`` is raised when the conversion returns ``None``.
        on_errors: string
            How exceptions in feature calculations are handled. One of 'nan', 'keep', 'raise'.
            'nan' returns a column of ``np.nan`` whose length equals the number of feature labels.
            'keep' returns a column holding the exception objects.
            'raise' (default) raises the exception.
        return_type: string
            Passed through to ``BaseFeaturizer`` unchanged.
        target_col
            Only relevant when the input is a pd.DataFrame, otherwise ignored.
            Names the single column to transform; ``None`` (default) uses all columns.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.radius = radius
        self.n_bits = n_bits
        # One encoder is created here and reused by every featurize call.
        self.mhfp = MHFPEncoder()
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Compute the SECFP encoding of one molecule and return it as a list.

        ``x`` is a Mol object or a SMILES string, depending on ``input_type``.
        Raises ``ValueError`` when a SMILES cannot be converted.
        """
        parse_smiles = self.input_type == 'smiles' or (
            self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol))
        if parse_smiles:
            mol = Chem.MolFromSmiles(x)
            if mol is None:
                raise ValueError('cannot convert Mol from SMILES %s' % x)
        else:
            mol = x

        encoded = self.mhfp.EncodeSECFPMol(mol, radius=self.radius, length=self.n_bits)
        return list(encoded)

    @property
    def feature_labels(self):
        """Labels ``secfp<2*radius>:<i>`` for ``i`` in ``range(n_bits)``."""
        prefix = f'secfp{self.radius * 2}:'
        return [prefix + str(i) for i in range(self.n_bits)]


[docs]class DescriptorFeature(BaseFeaturizer):

    classic = ['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt',
               'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge',
               'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2',
               'FpDensityMorgan3', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n',
               'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3',
               'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14',
               'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9',
               'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7',
               'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2',
               'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'TPSA',
               'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4',
               'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'VSA_EState1', 'VSA_EState10',
               'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8',
               'VSA_EState9', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles',
               'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles',
               'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds',
               'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'RingCount', 'MolLogP',
               'MolMR', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH',
               'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0',
               'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH',
               'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_amidine',
               'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine',
               'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan',
               'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan',
               'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy',
               'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso',
               'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_phos_acid',
               'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine',
               'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole',
               'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea']

    def __init__(self,
                 n_jobs=-1,
                 *,
                 input_type='mol',
                 on_errors='raise',
                 return_type='any',
                 target_col=None,
                 desc_list='all',
                 add_Hs=False):
        """
        RDKit molecular descriptors [values may include NaN].

        See https://www.rdkit.org/docs/GettingStartedInPython.html#list-of-available-descriptors
        for the full list of descriptors.

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Can be -1 or # of cpus. Set -1 to use all cpu cores (default).
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            for ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of column corresponding to the number of feature labs.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: string
            Forwarded unchanged to the base featurizer; see ``BaseFeaturizer``
            for the accepted values. Default is ``any``.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame is used.
            Default is None.
        desc_list: string or iterable of strings
            Descriptor names to be called in rdkit to calculate molecule descriptors.
            If ``all`` (default), every name in ``rdkit.Chem.Descriptors._descList``
            of the installed rdkit is used. (length = 208 in rdkit v.2020.09.xx)
            If ``classic``, the full list of rdkit v.2020.03.xx is used. (length = 200)
            Any other string is treated as the name of a single descriptor.
            Otherwise the given names are copied into a new list.
        add_Hs: boolean
            Add hydrogen atoms to the mol format in RDKit or not.
            This may affect a few physical descriptors (e.g., charge related ones).
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.add_Hs = add_Hs

        # Dispatch on ``str`` first: comparing an array-like with ``'all'`` would be
        # element-wise, and a bare descriptor name must not be iterated per character.
        if isinstance(desc_list, str):
            if desc_list == 'all':
                names = [entry[0] for entry in ChemDesc._descList]
            elif desc_list == 'classic':
                # Copy so that editing the labels of one instance never touches
                # the list shared by the whole class.
                names = list(self.classic)
            else:
                names = [desc_list]
        else:
            # Own copy: decouples the labels from the caller's object and lets a
            # one-shot iterable serve both the labels and the calculator.
            names = list(desc_list)

        self.nms = names
        self.calc = MoleculeDescriptors.MolecularDescriptorCalculator(self.nms)
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

[docs]    def featurize(self, x):
        if self.input_type == 'smiles':
            x_ = x
            x = Chem.MolFromSmiles(x)
            if x is None:
                raise ValueError('cannot convert Mol from SMILES %s' % x_)
            if self.add_Hs:
                x = Chem.AddHs(x)
                if x is None:
                    raise ValueError('cannot add Hs to Mol for %s' % x_)
        if self.input_type == 'any':
            if not isinstance(x, Chem.rdchem.Mol):
                x_ = x
                x = Chem.MolFromSmiles(x)
                if x is None:
                    raise ValueError('cannot convert Mol from SMILES %s' % x_)
            if self.add_Hs:
                x = Chem.AddHs(x)
                if x is None:
                    raise ValueError('cannot add Hs to Mol')
        return self.calc.CalcDescriptors(x)

    @property
    def feature_labels(self):
        """Descriptor names stored in ``self.nms``, i.e. the names given to the RDKit calculator."""
        labels = self.nms
        return labels


class Fingerprints(BaseDescriptor):
    """
    Fingerprints and RDKit descriptors of organic molecules, bundled in one descriptor.
    MHFP currently does not support parallel computing, so its n_jobs is fixed to 1.
    """

    def __init__(self,
                 n_jobs=-1,
                 *,
                 radius=3,
                 n_bits=2048,
                 bit_per_entry=None,
                 counting=False,
                 input_type='mol',
                 featurizers='all',
                 on_errors='raise',
                 target_col=None,
                 desc_list='all',
                 add_Hs=False):
        """

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Can be -1 or # of cpus. Set -1 to use all cpu cores (default).
        radius: int
            The radius parameter in the Morgan fingerprints,
            which is roughly half of the diameter parameter in ECFP/FCFP,
            i.e., radius=2 is roughly equivalent to ECFP4/FCFP4.
        n_bits: int
            Fixed bit length based on folding.
        bit_per_entry: int
            Number of bits used to represent a single entry (only for non-counting case)
            in RDKitFP, AtomPairFP, and TopologicalTorsionFP.
            Default value follows rdkit default.
        counting: boolean
            Record counts of the entries instead of bits only.
        featurizers: list[str] or str or 'all'
            Featurizer(s) that will be used.
            Default is 'all'.
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            for ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of column corresponding to the number of feature labs.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame is used.
            Default is None.
        desc_list: string or list
            List of descriptor names to be called in rdkit to calculate molecule descriptors.
            If ``classic``, the full list of rdkit v.2020.03.xx is used. (length = 200)
            Default is to use the latest list available in the rdkit. (length = 208 in rdkit v.2020.09.xx)
        add_Hs: boolean
            Add hydrogen atoms to the mol format in RDKit or not.
            This may affect a few physical descriptors (e.g., charge related ones) and currently no effect to fingerprints.
        """

        super().__init__(featurizers=featurizers)

        # Options handed unchanged to every featurizer below.
        shared = dict(input_type=input_type, on_errors=on_errors, target_col=target_col)
        # RDKitFP, AtomPairFP and TopologicalTorsionFP take the same extra options.
        entry_opts = dict(n_bits=n_bits, bit_per_entry=bit_per_entry, counting=counting, **shared)
        # ECFP and FCFP take the same extra options.
        morgan_opts = dict(radius=radius, n_bits=n_bits, counting=counting, **shared)

        # NOTE(review): every featurizer is assigned to the same ``self.mol`` attribute;
        # presumably BaseDescriptor collects these assignments instead of overwriting,
        # so the order below is kept exactly -- confirm in xenonpy.descriptor.base.
        self.mol = RDKitFP(n_jobs, **entry_opts)
        self.mol = AtomPairFP(n_jobs, **entry_opts)
        self.mol = TopologicalTorsionFP(n_jobs, **entry_opts)
        self.mol = MACCS(n_jobs, **shared)
        self.mol = ECFP(n_jobs, **morgan_opts)
        self.mol = FCFP(n_jobs, **morgan_opts)
        self.mol = PatternFP(n_jobs, n_bits=n_bits, **shared)
        self.mol = LayeredFP(n_jobs, n_bits=n_bits, **shared)
        # SECFP is intentionally not registered here (it was already disabled).
        # MHFP always gets n_jobs=1, whatever ``n_jobs`` was requested.
        self.mol = MHFP(1, radius=radius, n_bits=n_bits, **shared)
        self.mol = DescriptorFeature(n_jobs, desc_list=desc_list, add_Hs=add_Hs, **shared)