# Source code for xenonpy.descriptor.compositions

#  Copyright (c) 2021. yoshida-lab. All rights reserved.
#  Use of this source code is governed by a BSD-style
#  license that can be found in the LICENSE file.

from typing import Union, List

import numpy as np
import pandas as pd

from xenonpy.descriptor.base import BaseDescriptor, BaseCompositionFeaturizer

# Public names exported by ``from xenonpy.descriptor.compositions import *``.
__all__ = [
    'Compositions', 'Counting', 'WeightedAverage', 'WeightedSum', 'WeightedVariance',
    'HarmonicMean', 'GeometricMean', 'MaxPooling', 'MinPooling'
]


class Counting(BaseCompositionFeaturizer):
    """
    Encode a composition as a vector with one slot per known element.

    Each slot holds the amount of that element in the composition, or ``1``
    for every present element when ``one_hot_vec`` is enabled.
    """

    def __init__(self,
                 *,
                 one_hot_vec=False,
                 n_jobs=-1,
                 on_errors='raise',
                 return_type='any',
                 target_col=None):
        """

        Parameters
        ----------
        one_hot_vec : bool
            Set to ``True`` to use one-hot-vector encoding.
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Set -1 to use all cpu cores (default).
            Inputs ``X`` will be split into blocks, each of which runs on one cpu core.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise', which re-raises the exception.
        return_type: str
            Specify the return type.
            Can be ``any``, ``array`` and ``df``.
            ``array`` and ``df`` force the return type to ``np.ndarray`` and ``pd.DataFrame`` respectively.
            If ``any``, the return type depends on the input type.
            Default is ``any``.
        target_col
            Only relevant when the input is a pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """

        super().__init__(n_jobs=n_jobs,
                         on_errors=on_errors,
                         return_type=return_type,
                         target_col=target_col)
        self.one_hot_vec = one_hot_vec
        # Element labels in the order of the ``self.elements`` index; this order
        # fixes the position of every element in the output vector.
        self._elems = self.elements.index.tolist()
        self.__authors__ = ['TsumiNa']

    def mix_function(self, elems, nums):
        """
        Build the count (or one-hot) vector for a single composition.

        Parameters
        ----------
        elems
            Labels of the elements in the composition. Each must be present in
            the element list captured at construction time, otherwise
            ``list.index`` raises ``ValueError``.
        nums
            Amount of each element, in the same order as ``elems``.
            Not read when ``one_hot_vec`` is enabled.

        Returns
        -------
        np.ndarray
            Integer vector with one entry per known element.
        """
        # ``np.int`` was only an alias of the builtin ``int`` and has been removed
        # from recent NumPy releases; the builtin gives the identical dtype.
        # NOTE(review): with an integer dtype, fractional amounts are truncated
        # on assignment -- confirm this is intended.
        vec = np.zeros(len(self._elems), dtype=int)
        if self.one_hot_vec:
            for e in elems:
                vec[self._elems.index(e)] = 1
        else:
            for i, e in enumerate(elems):
                vec[self._elems.index(e)] = nums[i]

        return vec

    @property
    def feature_labels(self):
        """Labels of the output columns: the element labels themselves."""
        return self._elems


class WeightedAverage(BaseCompositionFeaturizer):
    """
    Composition-weighted arithmetic mean of the elemental properties.

    Parameters
    ----------
    elemental_info
        Elemental level information for each element, for example the ``atomic number``,
        ``atomic radius``, etc. By default (``None``), the XenonPy embedded information is used.
    n_jobs: int
        The number of jobs to run in parallel for both fit and predict.
        Set -1 to use all cpu cores (default).
        Inputs ``X`` will be split into blocks, each of which runs on one cpu core.
    on_errors: string
        How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
        When 'nan', return a column with ``np.nan``.
        The length of the column corresponds to the number of feature labels.
        When 'keep', return a column with exception objects.
        The default is 'raise', which re-raises the exception.
    return_type: str
        Specify the return type.
        Can be ``any``, ``array`` and ``df``.
        ``array`` and ``df`` force the return type to ``np.ndarray`` and ``pd.DataFrame`` respectively.
        If ``any``, the return type depends on the input type.
        Default is ``any``.
    target_col
        Only relevant when the input is a pd.DataFrame, otherwise ignored.
        Specify a single column to be used for transformation.
        If ``None``, all columns of the pd.DataFrame are used.
        Default is None.
    """

    def mix_function(self, elems, nums):
        """
        Average each elemental property, weighting every element by its share of ``nums``.

        Parameters
        ----------
        elems
            Labels of the elements in the composition; used to select rows of ``self.elements``.
        nums
            Amount of each element, in the same order as ``elems``.

        Returns
        -------
        np.ndarray
            One value per column of ``self.elements``.
        """
        props = self.elements.loc[elems, :].values
        # Normalise the amounts so that the weights sum to one.
        weights = nums / np.sum(nums)
        return weights.dot(props)

    @property
    def feature_labels(self):
        """Output column labels: every label of ``self.elements`` prefixed with ``ave:``."""
        return ['ave:' + s for s in self.elements]


class WeightedSum(BaseCompositionFeaturizer):
    """
    Composition-weighted sum of the elemental properties.

    Parameters
    ----------
    elemental_info
        Elemental level information for each element, for example the ``atomic number``,
        ``atomic radius``, etc. By default (``None``), the XenonPy embedded information is used.
    n_jobs: int
        The number of jobs to run in parallel for both fit and predict.
        Set -1 to use all cpu cores (default).
        Inputs ``X`` will be split into blocks, each of which runs on one cpu core.
    on_errors: string
        How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
        When 'nan', return a column with ``np.nan``.
        The length of the column corresponds to the number of feature labels.
        When 'keep', return a column with exception objects.
        The default is 'raise', which re-raises the exception.
    return_type: str
        Specify the return type.
        Can be ``any``, ``array`` and ``df``.
        ``array`` and ``df`` force the return type to ``np.ndarray`` and ``pd.DataFrame`` respectively.
        If ``any``, the return type depends on the input type.
        Default is ``any``.
    target_col
        Only relevant when the input is a pd.DataFrame, otherwise ignored.
        Specify a single column to be used for transformation.
        If ``None``, all columns of the pd.DataFrame are used.
        Default is None.
    """

    def mix_function(self, elems, nums):
        """
        Sum each elemental property, scaling every element by its raw amount.

        Unlike a weighted average, the amounts are used as given and are not
        normalised.

        Parameters
        ----------
        elems
            Labels of the elements in the composition; used to select rows of ``self.elements``.
        nums
            Amount of each element, in the same order as ``elems``.

        Returns
        -------
        np.ndarray
            One value per column of ``self.elements``.
        """
        props = self.elements.loc[elems, :].values
        amounts = np.array(nums)
        return amounts.dot(props)

    @property
    def feature_labels(self):
        """Output column labels: every label of ``self.elements`` prefixed with ``sum:``."""
        return ['sum:' + s for s in self.elements]


class GeometricMean(BaseCompositionFeaturizer):
    """
    Composition-weighted geometric mean of the elemental properties.

    Parameters
    ----------
    elemental_info
        Elemental level information for each element, for example the ``atomic number``,
        ``atomic radius``, etc. By default (``None``), the XenonPy embedded information is used.
    n_jobs: int
        The number of jobs to run in parallel for both fit and predict.
        Set -1 to use all cpu cores (default).
        Inputs ``X`` will be split into blocks, each of which runs on one cpu core.
    on_errors: string
        How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
        When 'nan', return a column with ``np.nan``.
        The length of the column corresponds to the number of feature labels.
        When 'keep', return a column with exception objects.
        The default is 'raise', which re-raises the exception.
    return_type: str
        Specify the return type.
        Can be ``any``, ``array`` and ``df``.
        ``array`` and ``df`` force the return type to ``np.ndarray`` and ``pd.DataFrame`` respectively.
        If ``any``, the return type depends on the input type.
        Default is ``any``.
    target_col
        Only relevant when the input is a pd.DataFrame, otherwise ignored.
        Specify a single column to be used for transformation.
        If ``None``, all columns of the pd.DataFrame are used.
        Default is None.
    """

    def mix_function(self, elems, nums):
        """
        Compute ``prod(x_i ** n_i) ** (1 / sum(n_i))`` for every elemental property.

        Parameters
        ----------
        elems
            Labels of the elements in the composition; used to select rows of ``self.elements``.
        nums
            Amount of each element, in the same order as ``elems``.

        Returns
        -------
        np.ndarray
            One value per column of ``self.elements``.
        """
        props = self.elements.loc[elems, :].values
        # Column vector so that each amount broadcasts across its element's row.
        amounts = np.array(nums).reshape(-1, 1)
        powered = props**amounts
        # The builtin ``sum`` over the column vector adds its rows and yields a
        # length-1 array holding the total amount, which broadcasts as the root.
        return np.power(powered.prod(axis=0), 1 / sum(amounts))

    @property
    def feature_labels(self):
        """Output column labels: every label of ``self.elements`` prefixed with ``gmean:``."""
        return ['gmean:' + s for s in self.elements]


class HarmonicMean(BaseCompositionFeaturizer):
    """
    Composition-weighted harmonic mean of the elemental properties.

    Parameters
    ----------
    elemental_info
        Elemental level information for each element, for example the ``atomic number``,
        ``atomic radius``, etc. By default (``None``), the XenonPy embedded information is used.
    n_jobs: int
        The number of jobs to run in parallel for both fit and predict.
        Set -1 to use all cpu cores (default).
        Inputs ``X`` will be split into blocks, each of which runs on one cpu core.
    on_errors: string
        How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
        When 'nan', return a column with ``np.nan``.
        The length of the column corresponds to the number of feature labels.
        When 'keep', return a column with exception objects.
        The default is 'raise', which re-raises the exception.
    return_type: str
        Specify the return type.
        Can be ``any``, ``array`` and ``df``.
        ``array`` and ``df`` force the return type to ``np.ndarray`` and ``pd.DataFrame`` respectively.
        If ``any``, the return type depends on the input type.
        Default is ``any``.
    target_col
        Only relevant when the input is a pd.DataFrame, otherwise ignored.
        Specify a single column to be used for transformation.
        If ``None``, all columns of the pd.DataFrame are used.
        Default is None.
    """

    def mix_function(self, elems, nums):
        """
        Compute ``sum(n_i) / sum(n_i / x_i)`` for every elemental property.

        Parameters
        ----------
        elems
            Labels of the elements in the composition; used to select rows of ``self.elements``.
        nums
            Amount of each element, in the same order as ``elems``.

        Returns
        -------
        np.ndarray
            One value per column of ``self.elements``.
        """
        reciprocals = 1 / self.elements.loc[elems, :].values
        amounts = np.array(nums)
        # Amount-weighted sum of the reciprocal property values.
        weighted_reciprocals = amounts.dot(reciprocals)

        return sum(amounts) / weighted_reciprocals

    @property
    def feature_labels(self):
        """Output column labels: every label of ``self.elements`` prefixed with ``hmean:``."""
        return ['hmean:' + s for s in self.elements]


class WeightedVariance(BaseCompositionFeaturizer):
    """
    Composition-weighted variance of the elemental properties.

    Parameters
    ----------
    elemental_info
        Elemental level information for each element, for example the ``atomic number``,
        ``atomic radius``, etc. By default (``None``), the XenonPy embedded information is used.
    n_jobs: int
        The number of jobs to run in parallel for both fit and predict.
        Set -1 to use all cpu cores (default).
        Inputs ``X`` will be split into blocks, each of which runs on one cpu core.
    on_errors: string
        How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
        When 'nan', return a column with ``np.nan``.
        The length of the column corresponds to the number of feature labels.
        When 'keep', return a column with exception objects.
        The default is 'raise', which re-raises the exception.
    return_type: str
        Specify the return type.
        Can be ``any``, ``array`` and ``df``.
        ``array`` and ``df`` force the return type to ``np.ndarray`` and ``pd.DataFrame`` respectively.
        If ``any``, the return type depends on the input type.
        Default is ``any``.
    target_col
        Only relevant when the input is a pd.DataFrame, otherwise ignored.
        Specify a single column to be used for transformation.
        If ``None``, all columns of the pd.DataFrame are used.
        Default is None.
    """

    def mix_function(self, elems, nums):
        """
        Compute the weighted mean squared deviation from the weighted mean.

        The weights are the amounts in ``nums`` normalised to sum to one, and
        the same weights are used for both the mean and the squared deviations.

        Parameters
        ----------
        elems
            Labels of the elements in the composition; used to select rows of ``self.elements``.
        nums
            Amount of each element, in the same order as ``elems``.

        Returns
        -------
        np.ndarray
            One value per column of ``self.elements``.
        """
        props = self.elements.loc[elems, :].values
        weights = nums / np.sum(nums)
        weighted_mean = weights.dot(props)
        # Broadcasting subtracts the per-property mean from every element row.
        deviations = props - weighted_mean
        return weights.dot(deviations**2)

    @property
    def feature_labels(self):
        """Output column labels: every label of ``self.elements`` prefixed with ``var:``."""
        return ['var:' + s for s in self.elements]


class MaxPooling(BaseCompositionFeaturizer):
    """
    Maximum of each elemental property over the elements in a composition.

    Parameters
    ----------
    elemental_info
        Elemental level information for each element, for example the ``atomic number``,
        ``atomic radius``, etc. By default (``None``), the XenonPy embedded information is used.
    n_jobs: int
        The number of jobs to run in parallel for both fit and predict.
        Set -1 to use all cpu cores (default).
        Inputs ``X`` will be split into blocks, each of which runs on one cpu core.
    on_errors: string
        How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
        When 'nan', return a column with ``np.nan``.
        The length of the column corresponds to the number of feature labels.
        When 'keep', return a column with exception objects.
        The default is 'raise', which re-raises the exception.
    return_type: str
        Specify the return type.
        Can be ``any``, ``array`` and ``df``.
        ``array`` and ``df`` force the return type to ``np.ndarray`` and ``pd.DataFrame`` respectively.
        If ``any``, the return type depends on the input type.
        Default is ``any``.
    target_col
        Only relevant when the input is a pd.DataFrame, otherwise ignored.
        Specify a single column to be used for transformation.
        If ``None``, all columns of the pd.DataFrame are used.
        Default is None.
    """

    def mix_function(self, elems, _):
        """
        Take the largest value of every elemental property among ``elems``.

        Parameters
        ----------
        elems
            Labels of the elements in the composition; used to select rows of ``self.elements``.
        _
            Amounts of the elements; ignored, since pooling does not depend on them.

        Returns
        -------
        np.ndarray
            One value per column of ``self.elements``.
        """
        selected = self.elements.loc[elems, :]
        return selected.max().values

    @property
    def feature_labels(self):
        """Output column labels: every label of ``self.elements`` prefixed with ``max:``."""
        return ['max:' + s for s in self.elements]


class MinPooling(BaseCompositionFeaturizer):
    """
    Minimum of each elemental property over the elements in a composition.

    Parameters
    ----------
    elemental_info
        Elemental level information for each element, for example the ``atomic number``,
        ``atomic radius``, etc. By default (``None``), the XenonPy embedded information is used.
    n_jobs: int
        The number of jobs to run in parallel for both fit and predict.
        Set -1 to use all cpu cores (default).
        Inputs ``X`` will be split into blocks, each of which runs on one cpu core.
    on_errors: string
        How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
        When 'nan', return a column with ``np.nan``.
        The length of the column corresponds to the number of feature labels.
        When 'keep', return a column with exception objects.
        The default is 'raise', which re-raises the exception.
    return_type: str
        Specify the return type.
        Can be ``any``, ``array`` and ``df``.
        ``array`` and ``df`` force the return type to ``np.ndarray`` and ``pd.DataFrame`` respectively.
        If ``any``, the return type depends on the input type.
        Default is ``any``.
    target_col
        Only relevant when the input is a pd.DataFrame, otherwise ignored.
        Specify a single column to be used for transformation.
        If ``None``, all columns of the pd.DataFrame are used.
        Default is None.
    """

    def mix_function(self, elems, _):
        """
        Take the smallest value of every elemental property among ``elems``.

        Parameters
        ----------
        elems
            Labels of the elements in the composition; used to select rows of ``self.elements``.
        _
            Amounts of the elements; ignored, since pooling does not depend on them.

        Returns
        -------
        np.ndarray
            One value per column of ``self.elements``.
        """
        selected = self.elements.loc[elems, :]
        return selected.min().values

    @property
    def feature_labels(self):
        """Output column labels: every label of ``self.elements`` prefixed with ``min:``."""
        return ['min:' + s for s in self.elements]


class Compositions(BaseDescriptor):
    """
    Calculate elemental descriptors from compound's composition.
    """

    # Featurizer names used when ``featurizers='classic'`` is requested.
    classic = ['WeightedAverage', 'WeightedSum', 'WeightedVariance', 'MaxPooling', 'MinPooling']

    def __init__(self,
                 *,
                 elemental_info: Union[pd.DataFrame, None] = None,
                 n_jobs: int = -1,
                 featurizers: Union[str, List[str]] = 'classic',
                 on_errors: str = 'nan'):
        """

        Parameters
        ----------
        elemental_info
            Elemental level information for each element, for example the ``atomic number``,
            ``atomic radius``, etc. By default (``None``), the XenonPy embedded information is used.
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Set -1 to use all cpu cores (default).
            Inputs ``X`` will be split into blocks, each of which runs on one cpu core.
        featurizers: Union[str, List[str]]
            Name of the featurizers that will be used.
            Set to ``'classic'`` to be compatible with the old version.
            This is equal to setting ``featurizers=['WeightedAverage', 'WeightedSum',
            'WeightedVariance', 'MaxPooling', 'MinPooling']``.
            Default is 'classic'.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            When 'raise', the exception is raised.
            The default is 'nan'.
        """

        if featurizers == 'classic':
            super().__init__(featurizers=self.classic)
        else:
            super().__init__(featurizers=featurizers)

        # NOTE(review): every featurizer is assigned to the same ``composition``
        # attribute. Presumably ``BaseDescriptor`` intercepts attribute assignment
        # and registers each featurizer under that name instead of overwriting the
        # previous one -- confirm against ``BaseDescriptor``.
        # The assignment order below matches the original explicit statements.
        self.composition = Counting(n_jobs=n_jobs, on_errors=on_errors)
        property_featurizers = (
            WeightedAverage,
            WeightedSum,
            WeightedVariance,
            GeometricMean,
            HarmonicMean,
            MaxPooling,
            MinPooling,
        )
        for featurizer_cls in property_featurizers:
            self.composition = featurizer_cls(n_jobs=n_jobs,
                                              on_errors=on_errors,
                                              elemental_info=elemental_info)