Source code for xenonpy.descriptor.compositions

#  Copyright (c) 2021. yoshida-lab. All rights reserved.
#  Use of this source code is governed by a BSD-style
#  license that can be found in the LICENSE file.

from typing import Union, List

import numpy as np
import pandas as pd

from xenonpy.descriptor.base import BaseDescriptor, BaseCompositionFeaturizer

__all__ = [
    'Compositions', 'Counting', 'WeightedAverage', 'WeightedSum', 'WeightedVariance',
    'HarmonicMean', 'GeometricMean', 'MaxPooling', 'MinPooling'
]


[docs]class Counting(BaseCompositionFeaturizer): def __init__(self, *, one_hot_vec=False, n_jobs=-1, on_errors='raise', return_type='any', target_col=None): """ Parameters ---------- one_hot_vec : bool Set ``true`` to using one-hot-vector encoding. n_jobs: int The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). Inputs ``X`` will be split into some blocks then run on each cpu cores. on_errors: string How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'. When 'nan', return a column with ``np.nan``. The length of column corresponding to the number of feature labs. When 'keep', return a column with exception objects. The default is 'raise' which will raise up the exception. return_type: str Specific the return type. Can be ``any``, ``array`` and ``df``. ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame`` respectively. If ``any``, the return type dependent on the input type. Default is ``any`` target_col Only relevant when input is pd.DataFrame, otherwise ignored. Specify a single column to be used for transformation. If ``None``, all columns of the pd.DataFrame is used. Default is None. """ super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col) self.one_hot_vec = one_hot_vec self._elems = self.elements.index.tolist() self.__authors__ = ['TsumiNa']
[docs] def mix_function(self, elems, nums): vec = np.zeros(len(self._elems), dtype=np.int) for i, e in enumerate(elems): if self.one_hot_vec: vec[self._elems.index(e)] = 1 else: vec[self._elems.index(e)] = nums[i] return vec
@property def feature_labels(self): return self._elems
[docs]class WeightedAverage(BaseCompositionFeaturizer): """ Parameters ---------- elemental_info Elemental level information for each element. For example, the ``atomic number``, ``atomic radius``, and etc. By default (``None``), will use the XenonPy embedded information. n_jobs: int The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). Inputs ``X`` will be split into some blocks then run on each cpu cores. on_errors: string How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'. When 'nan', return a column with ``np.nan``. The length of column corresponding to the number of feature labs. When 'keep', return a column with exception objects. The default is 'raise' which will raise up the exception. return_type: str Specific the return type. Can be ``any``, ``array`` and ``df``. ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame`` respectively. If ``any``, the return type dependent on the input type. Default is ``any`` target_col Only relevant when input is pd.DataFrame, otherwise ignored. Specify a single column to be used for transformation. If ``None``, all columns of the pd.DataFrame is used. Default is None. """
[docs] def mix_function(self, elems, nums): elems_ = self.elements.loc[elems, :].values w_ = nums / np.sum(nums) return w_.dot(elems_)
@property def feature_labels(self): return ['ave:' + s for s in self.elements]
[docs]class WeightedSum(BaseCompositionFeaturizer): """ Parameters ---------- elemental_info Elemental level information for each element. For example, the ``atomic number``, ``atomic radius``, and etc. By default (``None``), will use the XenonPy embedded information. n_jobs: int The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). Inputs ``X`` will be split into some blocks then run on each cpu cores. on_errors: string How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'. When 'nan', return a column with ``np.nan``. The length of column corresponding to the number of feature labs. When 'keep', return a column with exception objects. The default is 'raise' which will raise up the exception. return_type: str Specific the return type. Can be ``any``, ``array`` and ``df``. ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame`` respectively. If ``any``, the return type dependent on the input type. Default is ``any`` target_col Only relevant when input is pd.DataFrame, otherwise ignored. Specify a single column to be used for transformation. If ``None``, all columns of the pd.DataFrame is used. Default is None. """
[docs] def mix_function(self, elems, nums): elems_ = self.elements.loc[elems, :].values w_ = np.array(nums) return w_.dot(elems_)
@property def feature_labels(self): return ['sum:' + s for s in self.elements]
[docs]class GeometricMean(BaseCompositionFeaturizer): """ Parameters ---------- elemental_info Elemental level information for each element. For example, the ``atomic number``, ``atomic radius``, and etc. By default (``None``), will use the XenonPy embedded information. n_jobs: int The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). Inputs ``X`` will be split into some blocks then run on each cpu cores. on_errors: string How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'. When 'nan', return a column with ``np.nan``. The length of column corresponding to the number of feature labs. When 'keep', return a column with exception objects. The default is 'raise' which will raise up the exception. return_type: str Specific the return type. Can be ``any``, ``array`` and ``df``. ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame`` respectively. If ``any``, the return type dependent on the input type. Default is ``any`` target_col Only relevant when input is pd.DataFrame, otherwise ignored. Specify a single column to be used for transformation. If ``None``, all columns of the pd.DataFrame is used. Default is None. """
[docs] def mix_function(self, elems, nums): elems_ = self.elements.loc[elems, :].values w_ = np.array(nums).reshape(-1, 1) tmp = elems_**w_ return np.power(tmp.prod(axis=0), 1 / sum(w_))
@property def feature_labels(self): return ['gmean:' + s for s in self.elements]
[docs]class HarmonicMean(BaseCompositionFeaturizer): """ Parameters ---------- elemental_info Elemental level information for each element. For example, the ``atomic number``, ``atomic radius``, and etc. By default (``None``), will use the XenonPy embedded information. n_jobs: int The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). Inputs ``X`` will be split into some blocks then run on each cpu cores. on_errors: string How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'. When 'nan', return a column with ``np.nan``. The length of column corresponding to the number of feature labs. When 'keep', return a column with exception objects. The default is 'raise' which will raise up the exception. return_type: str Specific the return type. Can be ``any``, ``array`` and ``df``. ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame`` respectively. If ``any``, the return type dependent on the input type. Default is ``any`` target_col Only relevant when input is pd.DataFrame, otherwise ignored. Specify a single column to be used for transformation. If ``None``, all columns of the pd.DataFrame is used. Default is None. """
[docs] def mix_function(self, elems, nums): elems_ = 1 / self.elements.loc[elems, :].values w_ = np.array(nums) tmp = w_.dot(elems_) return sum(w_) / tmp
@property def feature_labels(self): return ['hmean:' + s for s in self.elements]
[docs]class WeightedVariance(BaseCompositionFeaturizer): """ Parameters ---------- elemental_info Elemental level information for each element. For example, the ``atomic number``, ``atomic radius``, and etc. By default (``None``), will use the XenonPy embedded information. n_jobs: int The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). Inputs ``X`` will be split into some blocks then run on each cpu cores. on_errors: string How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'. When 'nan', return a column with ``np.nan``. The length of column corresponding to the number of feature labs. When 'keep', return a column with exception objects. The default is 'raise' which will raise up the exception. return_type: str Specific the return type. Can be ``any``, ``array`` and ``df``. ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame`` respectively. If ``any``, the return type dependent on the input type. Default is ``any`` target_col Only relevant when input is pd.DataFrame, otherwise ignored. Specify a single column to be used for transformation. If ``None``, all columns of the pd.DataFrame is used. Default is None. """
[docs] def mix_function(self, elems, nums): elems_ = self.elements.loc[elems, :].values w_ = nums / np.sum(nums) mean_ = w_.dot(elems_) var_ = elems_ - mean_ return w_.dot(var_**2)
@property def feature_labels(self): return ['var:' + s for s in self.elements]
[docs]class MaxPooling(BaseCompositionFeaturizer): """ Parameters ---------- elemental_info Elemental level information for each element. For example, the ``atomic number``, ``atomic radius``, and etc. By default (``None``), will use the XenonPy embedded information. n_jobs: int The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). Inputs ``X`` will be split into some blocks then run on each cpu cores. on_errors: string How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'. When 'nan', return a column with ``np.nan``. The length of column corresponding to the number of feature labs. When 'keep', return a column with exception objects. The default is 'raise' which will raise up the exception. return_type: str Specific the return type. Can be ``any``, ``array`` and ``df``. ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame`` respectively. If ``any``, the return type dependent on the input type. Default is ``any`` target_col Only relevant when input is pd.DataFrame, otherwise ignored. Specify a single column to be used for transformation. If ``None``, all columns of the pd.DataFrame is used. Default is None. """
[docs] def mix_function(self, elems, _): elems_ = self.elements.loc[elems, :] return elems_.max().values
@property def feature_labels(self): return ['max:' + s for s in self.elements]
[docs]class MinPooling(BaseCompositionFeaturizer): """ Parameters ---------- elemental_info Elemental level information for each element. For example, the ``atomic number``, ``atomic radius``, and etc. By default (``None``), will use the XenonPy embedded information. n_jobs: int The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). Inputs ``X`` will be split into some blocks then run on each cpu cores. on_errors: string How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'. When 'nan', return a column with ``np.nan``. The length of column corresponding to the number of feature labs. When 'keep', return a column with exception objects. The default is 'raise' which will raise up the exception. return_type: str Specific the return type. Can be ``any``, ``array`` and ``df``. ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame`` respectively. If ``any``, the return type dependent on the input type. Default is ``any`` target_col Only relevant when input is pd.DataFrame, otherwise ignored. Specify a single column to be used for transformation. If ``None``, all columns of the pd.DataFrame is used. Default is None. """
[docs] def mix_function(self, elems, _): elems_ = self.elements.loc[elems, :] return elems_.min().values
@property def feature_labels(self): return ['min:' + s for s in self.elements]
[docs]class Compositions(BaseDescriptor): """ Calculate elemental descriptors from compound's composition. """ classic = ['WeightedAverage', 'WeightedSum', 'WeightedVariance', 'MaxPooling', 'MinPooling'] def __init__(self, *, elemental_info: Union[pd.DataFrame, None] = None, n_jobs: int = -1, featurizers: Union[str, List[str]] = 'classic', on_errors: str = 'nan'): """ Parameters ---------- elemental_info Elemental level information for each element. For example, the ``atomic number``, ``atomic radius``, and etc. By default (``None``), will use the XenonPy embedded information. n_jobs: int The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). Inputs ``X`` will be split into some blocks then run on each cpu cores. featurizers: Union[str, List[str]] Name of featurizers that will be used. Set to `classic` to be compatible with the old version. This is equal to set ``featurizers=['WeightedAverage', 'WeightedSum', 'WeightedVariance', 'MaxPooling', 'MinPooling']``. Default is 'all'. on_errors: string How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'. When 'nan', return a column with ``np.nan``. The length of column corresponding to the number of feature labs. When 'keep', return a column with exception objects. The default is 'nan' which will raise up the exception. """ if featurizers == 'classic': super().__init__(featurizers=self.classic) else: super().__init__(featurizers=featurizers) self.composition = Counting(n_jobs=n_jobs, on_errors=on_errors) self.composition = WeightedAverage(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) self.composition = WeightedSum(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) self.composition = WeightedVariance(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) self.composition = GeometricMean(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) self.composition = HarmonicMean(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) self.composition = MaxPooling(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) self.composition = MinPooling(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info)