# Copyright (c) 2021. yoshida-lab. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors as ChemDesc
from rdkit.Chem import MACCSkeys as MAC
from rdkit.Chem import rdMolDescriptors as rdMol
from rdkit.Chem import rdmolops as rdm
from rdkit.Chem.rdMHFPFingerprint import MHFPEncoder
from rdkit.ML.Descriptors import MoleculeDescriptors
from scipy.sparse import coo_matrix
from xenonpy.descriptor.base import BaseDescriptor, BaseFeaturizer
# Public names of this module: the individual featurizers, followed by the
# ``Fingerprints`` descriptor that instantiates all of them.
__all__ = [
    'RDKitFP',
    'AtomPairFP',
    'TopologicalTorsionFP',
    'MACCS',
    'FCFP',
    'ECFP',
    'PatternFP',
    'LayeredFP',
    'MHFP',
    'DescriptorFeature',
    'Fingerprints',
]
def count_fp(fp, dim=2**10):
    """
    Fold a sparse count fingerprint into a dense count vector of length ``dim``.

    Parameters
    ----------
    fp
        Object exposing ``GetNonzeroElements()``, which must return a mapping
        from entry index to count.
    dim: int
        Length of the returned vector. Entry ``i`` is accumulated at position ``i % dim``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``dim``; entries folded onto the same position are summed.
    """
    folded = np.zeros(dim, dtype=int)
    for index, count in fp.GetNonzeroElements().items():
        # ``index`` is a Python int, so the modulo is exact even for indices
        # that would not fit into a fixed-width numpy integer.
        folded[index % dim] += count
    return folded
class RDKitFP(BaseFeaturizer):

    def __init__(self, n_jobs=-1, *, n_bits=2048, bit_per_entry=None, counting=False,
                 input_type='mol', on_errors='raise', return_type='any', target_col=None):
        """
        RDKit fingerprint.

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Can be -1 or # of cpus. Set -1 to use all cpu cores (default).
        n_bits: int
            Fingerprint size.
        bit_per_entry: int
            Number of bits used to represent a single entry (only for non-counting case).
            Default value follows rdkit default.
        counting: boolean
            Record counts of the entries instead of bits only.
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            For ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: string
            Passed through unchanged to ``BaseFeaturizer``. Default is 'any'.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.n_bits = n_bits
        # Fall back to 2 bits per hash when the caller gives no value.
        self.bit_per_entry = 2 if bit_per_entry is None else bit_per_entry
        self.counting = counting
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Calculate the RDKit fingerprint of one molecule.

        Returns a list of bits, or an array of folded counts when ``counting`` is set.
        Raises ``ValueError`` when a SMILES input cannot be converted to a Mol.
        """
        # 'smiles' always parses the input; 'any' parses only what is not already a Mol.
        if self.input_type == 'smiles' or (self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol)):
            smiles = x
            x = Chem.MolFromSmiles(smiles)
            if x is None:
                raise ValueError(f'cannot convert Mol from SMILES {smiles}')

        if self.counting:
            # The unfolded count fingerprint is folded down to ``n_bits`` by ``count_fp``.
            return count_fp(rdm.UnfoldedRDKFingerprintCountBased(x), dim=self.n_bits)
        return list(Chem.RDKFingerprint(x, fpSize=self.n_bits, nBitsPerHash=self.bit_per_entry))

    @property
    def feature_labels(self):
        """One label per fingerprint position; counting mode uses the ``rdkit_c:`` prefix."""
        prefix = 'rdkit_c:' if self.counting else 'rdkit:'
        return [prefix + str(i) for i in range(self.n_bits)]
class AtomPairFP(BaseFeaturizer):

    def __init__(self, n_jobs=-1, *, n_bits=2048, bit_per_entry=None, counting=False,
                 input_type='mol', on_errors='raise', return_type='any', target_col=None):
        """
        Atom Pair fingerprints.
        Returns the atom-pair fingerprint for a molecule. The algorithm used is described here:
        R.E. Carhart, D.H. Smith, R. Venkataraghavan;
        "Atom Pairs as Molecular Features in Structure-Activity Studies: Definition and Applications"
        JCICS 25, 64-73 (1985).
        The result has a fixed length after folding: binary bits by default,
        or counts when ``counting`` is set.

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Can be -1 or # of cpus. Set -1 to use all cpu cores (default).
        n_bits: int
            Fixed bit length based on folding.
        bit_per_entry: int
            Number of bits used to represent a single entry (only for non-counting case).
            Default value follows rdkit default.
        counting: boolean
            Record counts of the entries instead of bits only.
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            For ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: string
            Passed through unchanged to ``BaseFeaturizer``. Default is 'any'.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.n_bits = n_bits
        # Fall back to 4 bits per entry when the caller gives no value.
        self.bit_per_entry = 4 if bit_per_entry is None else bit_per_entry
        self.counting = counting
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Calculate the atom-pair fingerprint of one molecule.

        Returns a list of bits, or an array of counts when ``counting`` is set.
        Raises ``ValueError`` when a SMILES input cannot be converted to a Mol.
        """
        # 'smiles' always parses the input; 'any' parses only what is not already a Mol.
        if self.input_type == 'smiles' or (self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol)):
            smiles = x
            x = Chem.MolFromSmiles(smiles)
            if x is None:
                raise ValueError(f'cannot convert Mol from SMILES {smiles}')

        if self.counting:
            return count_fp(rdMol.GetHashedAtomPairFingerprint(x, nBits=self.n_bits), dim=self.n_bits)
        return list(
            rdMol.GetHashedAtomPairFingerprintAsBitVect(x, nBits=self.n_bits, nBitsPerEntry=self.bit_per_entry))

    @property
    def feature_labels(self):
        """One label per fingerprint position; counting mode uses the ``apfp_c:`` prefix."""
        prefix = 'apfp_c:' if self.counting else 'apfp:'
        return [prefix + str(i) for i in range(self.n_bits)]
class TopologicalTorsionFP(BaseFeaturizer):

    def __init__(self, n_jobs=-1, *, n_bits=2048, bit_per_entry=None, counting=False,
                 input_type='mol', on_errors='raise', return_type='any', target_col=None):
        """
        Topological Torsion fingerprints.
        Returns the topological-torsion fingerprint for a molecule.
        The result has a fixed length after folding: binary bits by default,
        or counts when ``counting`` is set.

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Can be -1 or # of cpus. Set -1 to use all cpu cores (default).
        n_bits: int
            Fixed bit length based on folding.
        bit_per_entry: int
            Number of bits used to represent a single entry (only for non-counting case).
            Default value follows rdkit default.
        counting: boolean
            Record counts of the entries instead of bits only.
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            For ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: string
            Passed through unchanged to ``BaseFeaturizer``. Default is 'any'.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.n_bits = n_bits
        # Fall back to 4 bits per entry when the caller gives no value.
        self.bit_per_entry = 4 if bit_per_entry is None else bit_per_entry
        self.counting = counting
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Calculate the topological-torsion fingerprint of one molecule.

        Returns a list of bits, or an array of counts when ``counting`` is set.
        Raises ``ValueError`` when a SMILES input cannot be converted to a Mol.
        """
        # 'smiles' always parses the input; 'any' parses only what is not already a Mol.
        if self.input_type == 'smiles' or (self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol)):
            smiles = x
            x = Chem.MolFromSmiles(smiles)
            if x is None:
                raise ValueError(f'cannot convert Mol from SMILES {smiles}')

        if self.counting:
            return count_fp(rdMol.GetHashedTopologicalTorsionFingerprint(x, nBits=self.n_bits), dim=self.n_bits)
        return list(
            rdMol.GetHashedTopologicalTorsionFingerprintAsBitVect(x, nBits=self.n_bits,
                                                                  nBitsPerEntry=self.bit_per_entry))

    @property
    def feature_labels(self):
        """One label per fingerprint position; counting mode uses the ``ttfp_c:`` prefix."""
        prefix = 'ttfp_c:' if self.counting else 'ttfp:'
        return [prefix + str(i) for i in range(self.n_bits)]
class MACCS(BaseFeaturizer):

    def __init__(self, n_jobs=-1, *, input_type='mol', on_errors='raise', return_type='any', target_col=None):
        """
        The MACCS keys for a molecule. The result is a 167-bit vector. There are 166 public keys,
        but to maintain consistency with other software packages they are numbered from 1.

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Can be -1 or # of cpus. Set -1 to use all cpu cores (default).
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            For ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: string
            Passed through unchanged to ``BaseFeaturizer``. Default is 'any'.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Calculate the MACCS keys of one molecule and return them as a list of bits.

        Raises ``ValueError`` when a SMILES input cannot be converted to a Mol.
        """
        # 'smiles' always parses the input; 'any' parses only what is not already a Mol.
        if self.input_type == 'smiles' or (self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol)):
            smiles = x
            x = Chem.MolFromSmiles(smiles)
            if x is None:
                raise ValueError(f'cannot convert Mol from SMILES {smiles}')

        return list(MAC.GenMACCSKeys(x))

    @property
    def feature_labels(self):
        """Labels ``maccs:0`` ... ``maccs:166``, one per position of the 167-bit vector."""
        return ['maccs:' + str(i) for i in range(167)]
class FCFP(BaseFeaturizer):

    def __init__(self, n_jobs=-1, *, radius=3, n_bits=2048, counting=False,
                 input_type='mol', on_errors='raise', return_type='any', target_col=None):
        """
        Morgan (Circular) fingerprints + feature-based (FCFP)
        The algorithm used is described in the paper Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints.
        JCIM 50:742-54 (2010)

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Can be -1 or # of cpus. Set -1 to use all cpu cores (default).
        radius: int
            The radius parameter in the Morgan fingerprints, which is roughly half of the diameter parameter in FCFP,
            i.e., radius=2 is roughly equivalent to FCFP4.
        n_bits: int
            Fixed bit length based on folding.
        counting: boolean
            Record counts of the entries instead of bits only.
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            For ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: string
            Passed through unchanged to ``BaseFeaturizer``. Default is 'any'.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.radius = radius
        self.n_bits = n_bits
        self.counting = counting
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Calculate the feature-based Morgan fingerprint of one molecule.

        Returns a list of bits, or an array of counts when ``counting`` is set.
        Raises ``ValueError`` when a SMILES input cannot be converted to a Mol.
        """
        # 'smiles' always parses the input; 'any' parses only what is not already a Mol.
        if self.input_type == 'smiles' or (self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol)):
            smiles = x
            x = Chem.MolFromSmiles(smiles)
            if x is None:
                raise ValueError(f'cannot convert Mol from SMILES {smiles}')

        # ``useFeatures=True`` is the only difference from the ECFP calls.
        if self.counting:
            return count_fp(
                rdMol.GetHashedMorganFingerprint(x, radius=self.radius, nBits=self.n_bits, useFeatures=True),
                dim=self.n_bits)
        return list(rdMol.GetMorganFingerprintAsBitVect(x, radius=self.radius, nBits=self.n_bits, useFeatures=True))

    @property
    def feature_labels(self):
        """One label per position; the prefix carries the diameter (``radius * 2``), e.g. ``fcfp6:``."""
        prefix = f'fcfp{self.radius * 2}_c:' if self.counting else f'fcfp{self.radius * 2}:'
        return [prefix + str(i) for i in range(self.n_bits)]
class ECFP(BaseFeaturizer):

    def __init__(self, n_jobs=-1, *, radius=3, n_bits=2048, counting=False,
                 input_type='mol', on_errors='raise', return_type='any', target_col=None):
        """
        Morgan (Circular) fingerprints (ECFP)
        The algorithm used is described in the paper Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints.
        JCIM 50:742-54 (2010)

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Can be -1 or # of cpus. Set -1 to use all cpu cores (default).
        radius: int
            The radius parameter in the Morgan fingerprints, which is roughly half of the diameter parameter in ECFP,
            i.e., radius=2 is roughly equivalent to ECFP4.
        n_bits: int
            Fixed bit length based on folding.
        counting: boolean
            Record counts of the entries instead of bits only.
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            For ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: string
            Passed through unchanged to ``BaseFeaturizer``. Default is 'any'.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.radius = radius
        self.n_bits = n_bits
        self.counting = counting
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Calculate the Morgan fingerprint of one molecule.

        Returns a list of bits, or an array of counts when ``counting`` is set.
        Raises ``ValueError`` when a SMILES input cannot be converted to a Mol.
        """
        # 'smiles' always parses the input; 'any' parses only what is not already a Mol.
        if self.input_type == 'smiles' or (self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol)):
            smiles = x
            x = Chem.MolFromSmiles(smiles)
            if x is None:
                raise ValueError(f'cannot convert Mol from SMILES {smiles}')

        if self.counting:
            return count_fp(rdMol.GetHashedMorganFingerprint(x, radius=self.radius, nBits=self.n_bits),
                            dim=self.n_bits)
        return list(rdMol.GetMorganFingerprintAsBitVect(x, radius=self.radius, nBits=self.n_bits))

    @property
    def feature_labels(self):
        """One label per position; the prefix carries the diameter (``radius * 2``), e.g. ``ecfp6:``."""
        prefix = f'ecfp{self.radius * 2}_c:' if self.counting else f'ecfp{self.radius * 2}:'
        return [prefix + str(i) for i in range(self.n_bits)]
class PatternFP(BaseFeaturizer):

    def __init__(self, n_jobs=-1, *, n_bits=2048,
                 input_type='mol', on_errors='raise', return_type='any', target_col=None):
        """
        A fingerprint designed to be used in substructure screening using SMARTS patterns (unique in RDKit).

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Can be -1 or # of cpus. Set -1 to use all cpu cores (default).
        n_bits: int
            Fixed bit length based on folding.
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            For ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: string
            Passed through unchanged to ``BaseFeaturizer``. Default is 'any'.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.n_bits = n_bits
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Calculate the pattern fingerprint of one molecule and return it as a list of bits.

        Raises ``ValueError`` when a SMILES input cannot be converted to a Mol.
        """
        # 'smiles' always parses the input; 'any' parses only what is not already a Mol.
        if self.input_type == 'smiles' or (self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol)):
            smiles = x
            x = Chem.MolFromSmiles(smiles)
            if x is None:
                raise ValueError(f'cannot convert Mol from SMILES {smiles}')

        return list(rdm.PatternFingerprint(x, fpSize=self.n_bits))

    @property
    def feature_labels(self):
        """Labels ``patfp:0`` ... ``patfp:<n_bits - 1>``."""
        return ['patfp:' + str(i) for i in range(self.n_bits)]
class LayeredFP(BaseFeaturizer):

    def __init__(self, n_jobs=-1, *, n_bits=2048,
                 input_type='mol', on_errors='raise', return_type='any', target_col=None):
        """
        A substructure fingerprint that is more complex than PatternFP (unique in RDKit).

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Can be -1 or # of cpus. Set -1 to use all cpu cores (default).
        n_bits: int
            Fixed bit length based on folding.
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            For ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: string
            Passed through unchanged to ``BaseFeaturizer``. Default is 'any'.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.n_bits = n_bits
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Calculate the layered fingerprint of one molecule and return it as a list of bits.

        Raises ``ValueError`` when a SMILES input cannot be converted to a Mol.
        """
        # 'smiles' always parses the input; 'any' parses only what is not already a Mol.
        if self.input_type == 'smiles' or (self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol)):
            smiles = x
            x = Chem.MolFromSmiles(smiles)
            if x is None:
                raise ValueError(f'cannot convert Mol from SMILES {smiles}')

        return list(rdm.LayeredFingerprint(x, fpSize=self.n_bits))

    @property
    def feature_labels(self):
        """Labels ``layfp:0`` ... ``layfp:<n_bits - 1>``."""
        return ['layfp:' + str(i) for i in range(self.n_bits)]
class MHFP(BaseFeaturizer):

    def __init__(self, n_jobs=1, *, radius=3, n_bits=2048,
                 input_type='mol', on_errors='raise', return_type='any', target_col=None):
        """
        Variation from the MinHash fingerprint, which is based on ECFP with
        locality sensitive hashing to increase compactness of information during hashing.
        The algorithm used is described in the paper
        Probst, D. & Reymond, J.-L., A probabilistic molecular fingerprint for big data settings.
        Journal of Cheminformatics, 10:66 (2018)
        Note that MHFP currently does not support parallel computing, so please fix n_jobs to 1.

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Default is 1; keep it at 1 because parallel computing is not supported here.
        radius: int
            The radius parameter in the SECFP(RDKit version) fingerprints,
            which is roughly half of the diameter parameter in ECFP,
            i.e., radius=2 is roughly equivalent to ECFP4.
        n_bits: int
            Fixed bit length based on folding.
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            For ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: string
            Passed through unchanged to ``BaseFeaturizer``. Default is 'any'.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.radius = radius
        self.n_bits = n_bits
        # One encoder is created per featurizer and reused for every molecule.
        self.mhfp = MHFPEncoder()
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Calculate the SECFP encoding of one molecule and return it as a list of length ``n_bits``.

        Raises ``ValueError`` when a SMILES input cannot be converted to a Mol.
        """
        # 'smiles' always parses the input; 'any' parses only what is not already a Mol.
        if self.input_type == 'smiles' or (self.input_type == 'any' and not isinstance(x, Chem.rdchem.Mol)):
            smiles = x
            x = Chem.MolFromSmiles(smiles)
            if x is None:
                raise ValueError(f'cannot convert Mol from SMILES {smiles}')

        return list(self.mhfp.EncodeSECFPMol(x, radius=self.radius, length=self.n_bits))

    @property
    def feature_labels(self):
        """One label per position; the prefix carries the diameter (``radius * 2``), e.g. ``secfp6:``."""
        return [f'secfp{self.radius * 2}:' + str(i) for i in range(self.n_bits)]
class DescriptorFeature(BaseFeaturizer):

    # Descriptor names selected by ``desc_list='classic'``.
    classic = ['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt',
               'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge',
               'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2',
               'FpDensityMorgan3', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n',
               'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3',
               'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14',
               'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9',
               'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7',
               'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2',
               'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'TPSA',
               'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4',
               'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'VSA_EState1', 'VSA_EState10',
               'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8',
               'VSA_EState9', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles',
               'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles',
               'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds',
               'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'RingCount', 'MolLogP',
               'MolMR', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH',
               'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0',
               'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH',
               'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_amidine',
               'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine',
               'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan',
               'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan',
               'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy',
               'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso',
               'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_phos_acid',
               'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine',
               'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole',
               'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea']

    def __init__(self, n_jobs=-1, *, input_type='mol', on_errors='raise', return_type='any', target_col=None,
                 desc_list='all', add_Hs=False):
        """
        All descriptors in RDKit (length = 200) [may include NaN]
        see https://www.rdkit.org/docs/GettingStartedInPython.html#list-of-available-descriptors for the full list

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Can be -1 or # of cpus. Set -1 to use all cpu cores (default).
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            For ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: string
            Passed through unchanged to ``BaseFeaturizer``. Default is 'any'.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        desc_list: string or list
            List of descriptor names to be called in rdkit to calculate molecule descriptors.
            If ``classic``, the full list of rdkit v.2020.03.xx is used. (length = 200)
            Default is to use the latest list available in the rdkit. (length = 208 in rdkit v.2020.09.xx)
        add_Hs: boolean
            Add hydrogen atoms to the mol format in RDKit or not.
            This may affect a few physical descriptors (e.g., charge related ones).
            Hydrogens are only added when ``input_type`` is ``smiles`` or ``any``;
            with ``mol`` the molecule is used exactly as given.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.input_type = input_type
        self.add_Hs = add_Hs
        if desc_list == 'all':
            self.nms = [x[0] for x in ChemDesc._descList]
        elif desc_list == 'classic':
            # Copy, so that changes to ``feature_labels`` cannot alter the shared class attribute.
            self.nms = list(self.classic)
        else:
            self.nms = desc_list
        self.calc = MoleculeDescriptors.MolecularDescriptorCalculator(self.nms)
        self.__authors__ = ['Stephen Wu', 'TsumiNa']

    def featurize(self, x):
        """
        Calculate the selected RDKit descriptors of one molecule.

        Returns whatever ``MolecularDescriptorCalculator.CalcDescriptors`` yields for the molecule.
        Raises ``ValueError`` when a SMILES input cannot be converted to a Mol.
        """
        if self.input_type == 'smiles':
            smiles = x
            x = Chem.MolFromSmiles(smiles)
            if x is None:
                raise ValueError(f'cannot convert Mol from SMILES {smiles}')
            if self.add_Hs:
                x = Chem.AddHs(x)
                if x is None:
                    raise ValueError(f'cannot add Hs to Mol for {smiles}')
        elif self.input_type == 'any':
            if not isinstance(x, Chem.rdchem.Mol):
                smiles = x
                x = Chem.MolFromSmiles(smiles)
                if x is None:
                    raise ValueError(f'cannot convert Mol from SMILES {smiles}')
            # NOTE(review): the nesting of this step was reconstructed; it is assumed to
            # apply to every 'any' input (Mol or SMILES) because its error message does
            # not reference the SMILES string -- confirm against upstream.
            if self.add_Hs:
                x = Chem.AddHs(x)
                if x is None:
                    raise ValueError('cannot add Hs to Mol')

        return self.calc.CalcDescriptors(x)

    @property
    def feature_labels(self):
        """The descriptor names handed to the calculator, in calculation order."""
        return self.nms
class Fingerprints(BaseDescriptor):
    """
    Calculate fingerprints or descriptors of organic molecules.
    Note that MHFP currently does not support parallel computing, so n_jobs is fixed to 1.
    """

    def __init__(self,
                 n_jobs=-1,
                 *,
                 radius=3,
                 n_bits=2048,
                 bit_per_entry=None,
                 counting=False,
                 input_type='mol',
                 featurizers='all',
                 on_errors='raise',
                 target_col=None,
                 desc_list='all',
                 add_Hs=False):
        """

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Can be -1 or # of cpus. Set -1 to use all cpu cores (default).
            Not applied to MHFP, which always runs with a single job.
        radius: int
            The radius parameter in the Morgan fingerprints,
            which is roughly half of the diameter parameter in ECFP/FCFP,
            i.e., radius=2 is roughly equivalent to ECFP4/FCFP4.
        n_bits: int
            Fixed bit length based on folding.
        bit_per_entry: int
            Number of bits used to represent a single entry (only for non-counting case)
            in RDKitFP, AtomPairFP, and TopologicalTorsionFP.
            Default value follows rdkit default.
        counting: boolean
            Record counts of the entries instead of bits only.
        featurizers: list[str] or str or 'all'
            Featurizer(s) that will be used.
            Default is 'all'.
        input_type: string
            Set the specific type of transform input.
            Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
            When set to ``smiles``, ``transform`` method can use a SMILES list as input.
            Set to ``any`` to use both.
            If input is SMILES, ``Chem.MolFromSmiles`` function will be used inside.
            For ``None`` returns, a ``ValueError`` exception will be raised.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        desc_list: string or list
            List of descriptor names to be called in rdkit to calculate molecule descriptors.
            If ``classic``, the full list of rdkit v.2020.03.xx is used. (length = 200)
            Default is to use the latest list available in the rdkit. (length = 208 in rdkit v.2020.09.xx)
        add_Hs: boolean
            Add hydrogen atoms to the mol format in RDKit or not.
            This may affect a few physical descriptors (e.g., charge related ones)
            and currently has no effect on fingerprints.
        """
        super().__init__(featurizers=featurizers)

        # NOTE(review): every featurizer is assigned to the same attribute ``mol``.
        # Presumably ``BaseDescriptor`` collects each assignment instead of overwriting
        # the previous one -- confirm in xenonpy.descriptor.base. The order is kept as is.
        self.mol = RDKitFP(n_jobs, n_bits=n_bits, bit_per_entry=bit_per_entry, counting=counting,
                           input_type=input_type, on_errors=on_errors, target_col=target_col)
        self.mol = AtomPairFP(n_jobs, n_bits=n_bits, bit_per_entry=bit_per_entry, counting=counting,
                              input_type=input_type, on_errors=on_errors, target_col=target_col)
        self.mol = TopologicalTorsionFP(n_jobs, n_bits=n_bits, input_type=input_type, bit_per_entry=bit_per_entry,
                                        counting=counting, on_errors=on_errors, target_col=target_col)
        self.mol = MACCS(n_jobs, input_type=input_type, on_errors=on_errors, target_col=target_col)
        self.mol = ECFP(n_jobs, radius=radius, n_bits=n_bits, input_type=input_type, counting=counting,
                        on_errors=on_errors, target_col=target_col)
        self.mol = FCFP(n_jobs, radius=radius, n_bits=n_bits, input_type=input_type, counting=counting,
                        on_errors=on_errors, target_col=target_col)
        self.mol = PatternFP(n_jobs, n_bits=n_bits, input_type=input_type, on_errors=on_errors,
                             target_col=target_col)
        self.mol = LayeredFP(n_jobs, n_bits=n_bits, input_type=input_type, on_errors=on_errors,
                             target_col=target_col)
        # MHFP does not support parallel computing, so it ignores ``n_jobs`` and runs with 1 job.
        self.mol = MHFP(1, radius=radius, n_bits=n_bits,
                        input_type=input_type, on_errors=on_errors, target_col=target_col)
        self.mol = DescriptorFeature(n_jobs, input_type=input_type,
                                     on_errors=on_errors, target_col=target_col, desc_list=desc_list, add_Hs=add_Hs)