Source code for xenonpy.descriptor.base

#  Copyright (c) 2021. yoshida-lab. All rights reserved.
#  Use of this source code is governed by a BSD-style
#  license that can be found in the LICENSE file.

from abc import ABCMeta, abstractmethod
from collections import defaultdict
from collections.abc import Iterable
import itertools
import warnings
from multiprocessing import cpu_count
from typing import DefaultDict, List, Sequence, Union, Set
from joblib import Parallel, delayed

import numpy as np
import pandas as pd
from pymatgen.core.composition import Composition as PMGComp
from sklearn.base import TransformerMixin, BaseEstimator

from xenonpy.datatools.preset import preset
from xenonpy.utils import TimedMetaClass, Switch


class BaseFeaturizer(BaseEstimator, TransformerMixin, metaclass=ABCMeta):
    """
    Abstract class to calculate features from :class:`pandas.Series` input data.
    Each entry can be any format, such as a compound formula or a pymatgen crystal
    structure, depending on the featurizer implementation.

    This class has a similar structure to `matminer BaseFeaturizer`_ but follows a
    stricter convention. That means you can embed this feature directly into a
    `matminer BaseFeaturizer`_ class implementation::

        class MatFeature(BaseFeaturizer):
            def featurize(self, *x):
                return <xenonpy_featurizer>.featurize(*x)

    .. _matminer BaseFeaturizer: https://github.com/hackingmaterials/matminer/blob/master/matminer/featurizers/base_smc.py

    **Using a BaseFeaturizer Class**

    :class:`BaseFeaturizer` implements :class:`sklearn.base.BaseEstimator` and
    :class:`sklearn.base.TransformerMixin`, which means you can use it in a
    scikit-learn way::

        featurizer = SomeFeaturizer()
        features = featurizer.fit_transform(X)

    You can also employ the featurizer as part of a scikit-learn Pipeline object.
    You would then provide your input data as an array to the Pipeline, which would
    output the features as a :class:`pandas.DataFrame`.

    :class:`BaseFeaturizer` also lets you retrieve proper references for a featurizer.
    ``__citations__`` is a list of papers that should be cited.
    ``__authors__`` is a list of people who wrote the featurizer.
    Both are also exposed, joined by newlines into a single string, through the
    properties ``citations`` and ``authors``.

    **Implementing a New BaseFeaturizer Class**

    These operations must be implemented for each new featurizer:

    - ``featurize`` - Takes a single material as input, returns the features of that material.
    - ``feature_labels`` - Generates a human-meaningful name for each of the features.
      **Implement this as property**.

    It is also suggested to override these two **class attributes**:

    - ``__citations__`` - A list of citations, ideally in BibTeX format.
    - ``__authors__`` - A list of people who contributed to the featurizer.

    All options of the featurizer must be set by the ``__init__`` function. All
    options must be listed as keyword arguments with default values, and the value
    must be saved as a class attribute with the same name or as a property
    (e.g., argument `n` should be stored in `self.n`).
    These requirements are necessary for compatibility with the ``get_params`` and
    ``set_params`` methods of ``BaseEstimator``, which enable easy interoperability
    with scikit-learn.

    :meth:`featurize` must return a list of features in :class:`numpy.ndarray`.

    .. note::

        None of these operations should change the state of the featurizer. I.e.,
        running each method twice should not produce different results, no class
        attributes should be changed, and running one operation should not affect
        the output of another.
    """

    __authors__ = ['anonymous']
    __citations__ = ['No citations']

    def __init__(
        self,
        n_jobs: int = -1,
        *,
        on_errors: str = 'raise',
        return_type: str = 'any',
        target_col: Union[List[str], str, None] = None,
        parallel_verbose: int = 0,
    ):
        """
        Parameters
        ----------
        n_jobs
            The number of jobs to run in parallel for both fit and predict.
            Set -1 to use all cpu cores (default).
            Inputs ``X`` will be split into some blocks then run on each cpu cores.
            When set to 0, input X will be treated as a block and pass to
            ``Featurizer.featurize`` directly.
            This default parallel implementation does not support pd.DataFrame input,
            so please make sure you set n_jobs=0 if the input will be pd.DataFrame.
        on_errors
            How to handle the exceptions in a feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of column corresponding to the number of feature labs.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type
            Specify the return type.
            Can be ``any``, ``custom``, ``array`` and ``df``.
            ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame``
            respectively.
            If ``any`` or ``custom``, the return type depends on multiple factors
            (see transform function).
            Default is ``any``
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame is used.
            Default is None.
        parallel_verbose
            The verbosity level: if non zero, progress messages are printed.
            Above 50, the output is sent to stdout.
            The frequency of the messages increases with the verbosity level.
            If it more than 10, all iterations are reported.
            Default ``0``.
        """
        # Each assignment below (except target_col and _kwargs) goes through a
        # validating property setter defined further down.
        self.return_type = return_type
        self.target_col = target_col
        self.n_jobs = n_jobs
        self.on_errors = on_errors
        self.parallel_verbose = parallel_verbose
        # Extra keyword arguments of the latest ``transform`` call; read by ``_wrapper``.
        self._kwargs = {}

    @property
    def return_type(self):
        """Default return type used by ``transform``."""
        return self._return_type

    @return_type.setter
    def return_type(self, val):
        if val not in {'any', 'array', 'df', 'custom'}:
            raise ValueError('`return_type` must be `any`, `custom`, `array` or `df`')
        self._return_type = val

    @property
    def on_errors(self):
        """Error handling policy: one of 'nan', 'keep' or 'raise'."""
        return self._on_errors

    @on_errors.setter
    def on_errors(self, val):
        if val not in {'nan', 'keep', 'raise'}:
            raise ValueError('`on_errors` must be `nan`, `keep` or `raise`')
        self._on_errors = val

    @property
    def parallel_verbose(self):
        """Verbosity level handed to :class:`joblib.Parallel`."""
        return self._parallel_verbose

    @parallel_verbose.setter
    def parallel_verbose(self, val):
        if not isinstance(val, int):
            raise ValueError('`parallel_verbose` must be int')
        self._parallel_verbose = val

    @property
    def n_jobs(self):
        """Effective number of parallel jobs (already resolved against ``cpu_count()``)."""
        return self._n_jobs

    @n_jobs.setter
    def n_jobs(self, n_jobs):
        """Set the number of parallel jobs.

        Values below -1 are treated as -1; -1 and values above the number of
        available cores are replaced by ``cpu_count()``.
        """
        if n_jobs < -1:
            n_jobs = -1
        if n_jobs > cpu_count() or n_jobs == -1:
            self._n_jobs = cpu_count()
        else:
            self._n_jobs = n_jobs

    def fit(self, X, y=None, **fit_kwargs):
        """Update the parameters of this featurizer based on available data.

        The default implementation does nothing and returns ``self``.

        Args:
            X - [list of tuples], training data
        Returns:
            self
        """
        return self

    # todo: Does fit_transform need to pass paras to transform?
    def fit_transform(self, X, y=None, **fit_params):
        """Fit to data, then transform it.

        Fits transformer to X and y with optional parameters fit_params
        and returns a transformed version of X.
        Note that ``fit_params`` are forwarded to both ``fit`` and ``transform``.

        Parameters
        ----------
        X : numpy array of shape [n_samples, n_features]
            Training set.
        y : numpy array of shape [n_samples]
            Target values.

        Returns
        -------
        X_new : numpy array of shape [n_samples, n_features_new]
            Transformed array.
        """
        # non-optimized default implementation; override when a better
        # method is possible for a given clustering algorithm
        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            return self.fit(X, **fit_params).transform(X, **fit_params)
        else:
            # fit method of arity 2 (supervised transformation)
            return self.fit(X, y, **fit_params).transform(X, **fit_params)

    def transform(self, entries: Sequence, *, return_type=None, target_col=None, **kwargs):
        """
        Featurize a list of entries.
        If `featurize` takes multiple inputs, supply inputs as a list of tuples,
        or use pd.DataFrame with parameter ``target_col`` to specify the column name(s).

        Args
        ----
        entries: list-like or pd.DataFrame
            A list of entries to be featurized or pd.DataFrame with one specified column.
            See detail of target_col if entries is pd.DataFrame.
            Also, make sure n_jobs=0 for pd.DataFrame.
        return_type: str
            Specify the return type.
            Can be ``any``, ``custom``, ``array`` or ``df``.
            ``array`` or ``df`` forces return type to ``np.ndarray`` or ``pd.DataFrame``, respectively.
            If ``any``, the return type follow prefixed rules:
            (1) if input type is pd.Series or pd.DataFrame, returns pd.DataFrame;
            (2) else if input type is np.array, returns np.array;
            (3) else if other input type and n_jobs=0, follows the featurize function return;
            (4) otherwise, return a list of objects (output of featurize function).
            If ``custom``, the return type depends on the featurize function if n_jobs=0,
            or the return type is a list of objects (output of featurize function) for other n_jobs values.
            This is a one-time change that only have effect in the current transformation.
            Default is ``None`` for using the setting at initialization step.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            Default is ``None`` for using the setting at initialization step.
            (see __init__ for more information)

        Returns
        -------
        list, numpy.ndarray or pandas.DataFrame
            Features for each entry; the concrete type follows ``return_type``
            as described above. An empty input always yields an empty ``list``.

        Raises
        ------
        TypeError
            If ``entries`` is not iterable.
        ValueError
            If ``return_type`` is not one of the accepted values.
        RuntimeError
            If ``entries`` is still a pd.DataFrame after column selection and ``n_jobs`` is not 0.
        """
        # Stored so that ``_wrapper`` can forward the same kwargs to ``featurize``.
        self._kwargs = kwargs

        # Check inputs
        if not isinstance(entries, Iterable):
            raise TypeError('parameter "entries" must be a iterable object')

        # Extract relevant columns for pd.DataFrame input
        if isinstance(entries, pd.DataFrame):
            if target_col is None:
                target_col = self.target_col
                if target_col is None:
                    target_col = entries.columns.values
            entries = entries[target_col]

        # Special case: Empty list
        # (value comparison; the previous identity test ``is 0`` was implementation-dependent)
        if len(entries) == 0:
            return []

        # Check outputs
        if return_type not in {None, 'any', 'array', 'df', 'custom'}:
            raise ValueError('`return_type` must be None, `any`, `custom`, `array` or `df`')

        # Dispatch on the number of jobs: 0 -> whole block, 1 -> serial, otherwise joblib.
        for c in Switch(self._n_jobs):
            if c(0):
                # Run the actual featurization
                ret = self.featurize(entries, **kwargs)
                break
            if isinstance(entries, pd.DataFrame):
                raise RuntimeError("Auto-parallel can not be used when`entries` is `pandas.DataFrame`. "
                                   "Please set `n_jobs` to 0 and implements your algorithm in the `featurize` method")
            if c(1):
                ret = [self._wrapper(x) for x in entries]
                break
            if c():
                ret = Parallel(n_jobs=self._n_jobs,
                               verbose=self._parallel_verbose)(delayed(self._wrapper)(x) for x in entries)

        # Labels are optional: a featurizer may signal their absence with NotImplementedError.
        try:
            labels = self.feature_labels
        except NotImplementedError:
            labels = None

        if return_type is None:
            return_type = self.return_type

        if return_type == 'any':
            if isinstance(entries, (pd.Series, pd.DataFrame)):
                return pd.DataFrame(ret, index=entries.index, columns=labels)
            if isinstance(entries, np.ndarray):
                return np.array(ret)
            return ret

        if return_type == 'array':
            return np.array(ret)

        if return_type == 'df':
            if isinstance(entries, (pd.Series, pd.DataFrame)):
                return pd.DataFrame(ret, index=entries.index, columns=labels)
            return pd.DataFrame(ret, columns=labels)

        if return_type == 'custom':
            return ret

    def _wrapper(self, x):
        """
        Call ``featurize`` on a single entry and apply the ``on_errors`` policy.

        Tuples, lists and ``numpy.ndarray`` entries are unpacked into positional
        arguments; any other entry is passed as a single argument. The keyword
        arguments of the current ``transform`` call are forwarded as well.

        Args:
            x: input data to featurize (type depends on featurizer).

        Returns:
            The output of ``featurize``. On failure, a list as long as
            ``feature_labels`` filled with ``np.nan`` (``on_errors='nan'``) or
            with the exception object (``on_errors='keep'``).

        Raises:
            Exception: whatever ``featurize`` raised, when ``on_errors='raise'``.
        """
        try:
            if not isinstance(x, (tuple, list, np.ndarray)):
                return self.featurize(x, **self._kwargs)
            return self.featurize(*x, **self._kwargs)
        except Exception as e:
            if self._on_errors == 'nan':
                return [np.nan] * len(self.feature_labels)
            elif self._on_errors == 'keep':
                return [e] * len(self.feature_labels)
            else:
                # bare ``raise`` re-raises the active exception with its traceback untouched
                raise

    @abstractmethod
    def featurize(self, *x, **kwargs):
        """
        Main featurizer function, which has to be implemented
        in any derived featurizer subclass.

        Args
        ----
        x: depends on featurizer
            input data to featurize.

        Returns
        -------
        any: numpy.ndarray
            one or more features.
        """

    @property
    @abstractmethod
    def feature_labels(self):
        """
        Generate attribute names.

        Returns:
            ([str]) attribute labels.
        """

    @property
    def citations(self):
        """
        Citation(s) and reference(s) for this feature.

        Returns:
            (str) the entries of ``__citations__`` joined by newlines;
                each entry should be a string citation, ideally in BibTeX format.
        """
        return '\n'.join(self.__citations__)

    @property
    def authors(self):
        """
        Implementors of the feature.

        Returns:
            (str) the entries of ``__authors__`` joined by newlines.
                Entries must be strings because they are passed to ``str.join``.
        """
        return '\n'.join(self.__authors__)
class BaseDescriptor(BaseEstimator, TransformerMixin, metaclass=TimedMetaClass):
    """
    Abstract class to organize featurizers.

    This class can take list-like[object] or pd.DataFrame as input for transformation or fitting.
    For pd.DataFrame, if any column name matches any group name,
    the matched group(s) will be calculated with corresponding column(s);
    otherwise, the pd.DataFrame will be passed on as is.

    Every :class:`BaseFeaturizer` assigned to an attribute of the descriptor is
    collected into the group named after that attribute instead of being stored
    as a normal attribute (see ``__setattr__``).

    Examples
    --------

    .. code::


        class MyDescriptor(BaseDescriptor):

            def __init__(self, n_jobs=-1):
                self.descriptor = SomeFeature1(n_jobs)
                self.descriptor = SomeFeature2(n_jobs)
                self.descriptor = SomeFeature3(n_jobs)
                self.descriptor = SomeFeature4(n_jobs)

    """

    def __init__(self, *, featurizers: Union[List[str], str] = 'all', on_errors: str = 'raise'):
        """

        Parameters
        ----------
        featurizers
            Specify which Featurizer(s) will be used.
            Default is 'all'.
        on_errors
            How to handle the exceptions in a feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of column corresponding to the number of feature labs.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        """
        # Class names of all registered featurizers; used to reject duplicates.
        self.__featurizers__: Set[str] = set()
        # Group name -> featurizers registered under that name.
        self.__featurizer_sets__: DefaultDict[str, List[BaseFeaturizer]] = defaultdict(list)
        self.featurizers = featurizers
        self.on_errors = on_errors

    @property
    def on_errors(self):
        """Error handling policy: one of 'nan', 'keep' or 'raise'."""
        return self._on_errors

    @on_errors.setter
    def on_errors(self, val):
        if val not in {'nan', 'keep', 'raise'}:
            raise ValueError('`on_errors` must be `nan`, `keep` or `raise`')
        self._on_errors = val
        # Push the policy down to every featurizer registered so far.
        for fea_set in self.__featurizer_sets__.values():
            for fea in fea_set:
                fea.on_errors = val

    @property
    def featurizers(self):
        """Selected featurizer class names: the string ``'all'`` or a tuple of names."""
        return self._featurizers

    @featurizers.setter
    def featurizers(self, val):
        if isinstance(val, str):
            # a single name is wrapped into a tuple; 'all' is kept as a sentinel string
            self._featurizers = val if val == 'all' else (val,)
        elif isinstance(val, (tuple, list)):
            self._featurizers = tuple(val)
        else:
            raise ValueError('parameter `featurizers` must be `all`, name of featurizer, or list of name of featurizer')

    @property
    def elapsed(self):
        """Elapsed time reported by ``self._timer``."""
        # NOTE(review): ``_timer`` is not assigned anywhere in this class; presumably
        # it is injected by ``TimedMetaClass`` -- confirm.
        return self._timer.elapsed

    def __setattr__(self, key, value):
        """Register featurizers into groups; store everything else normally.

        Raises:
            RuntimeError: if ``__featurizer_sets__`` is assigned something that is
                not a ``defaultdict``, or if a featurizer of an already registered
                class is assigned.
        """
        if key == '__featurizer_sets__':
            if not isinstance(value, defaultdict):
                raise RuntimeError('Can not set "self.__featurizer_sets__" by yourself')
            super().__setattr__(key, value)
            return

        if isinstance(value, BaseFeaturizer):
            fea_name = value.__class__.__name__
            if fea_name in self.__featurizers__:
                raise RuntimeError('Duplicated featurizer <%s>' % fea_name)
            # the attribute name becomes the group name
            self.__featurizer_sets__[key].append(value)
            self.__featurizers__.add(fea_name)
        else:
            super().__setattr__(key, value)

    def __repr__(self):
        return self.__class__.__name__ + ':\n' + \
               '\n'.join(
                   [' |- %s:\n | |- %s' % (k, '\n | |- '.join(map(lambda s: s.__class__.__name__, v)))
                    for k, v in self.__featurizer_sets__.items()])

    def _is_selected(self, featurizer):
        """Return True if ``featurizer`` is enabled by the ``featurizers`` setting."""
        return self._featurizers == 'all' or featurizer.__class__.__name__ in self._featurizers

    def _check_input(self, X, y=None, **kwargs):
        """Normalize ``X`` and ``y`` for group-wise dispatching.

        Each of them is converted independently:

        * ``None`` is returned unchanged.
        * With exactly one group, a list, a 1-d ``np.ndarray`` or a ``pd.Series``
          becomes a single-column ``pd.DataFrame`` named after that group.
        * A ``pd.Series`` is otherwise promoted to a ``pd.DataFrame``.
        * A ``pd.DataFrame`` whose columns (together with the keys of ``kwargs``)
          match no group name is wrapped into a one-element list and a
          ``UserWarning`` is issued; otherwise it is returned as is.

        Raises:
            TypeError: for any other input.
        """

        def _reformat(x):
            if x is None:
                return x

            keys = list(self.__featurizer_sets__.keys())
            if len(keys) == 1:
                if isinstance(x, list):
                    return pd.DataFrame(pd.Series(x), columns=keys)

                if isinstance(x, np.ndarray):
                    if len(x.shape) == 1:
                        return pd.DataFrame(x, columns=keys)

                if isinstance(x, pd.Series):
                    return pd.DataFrame(x.values, columns=keys, index=x.index)

            if isinstance(x, pd.Series):
                x = pd.DataFrame(x)

            if isinstance(x, pd.DataFrame):
                tmp = set(x.columns) | set(kwargs.keys())
                if set(keys).isdisjoint(tmp):
                    # raise KeyError('name of columns do not match any feature set')
                    warnings.warn(
                        'name of columns do not match any feature set, '
                        'the whole dataframe is applied to all feature sets', UserWarning)
                    # allow type check later for this special case
                    return [x]
                return x

            raise TypeError('you can not ues a array-like input '
                            'because there are multiple feature sets or the dim of input is not 1')

        return _reformat(X), _reformat(y)

    def _rename(self, **fit_params):
        """Rename groups: for every ``old=new`` pair, move group ``old`` to name ``new``."""
        for k, v in fit_params.items():
            if k in self.__featurizer_sets__:
                self.__featurizer_sets__[v] = self.__featurizer_sets__.pop(k)

    @property
    def all_featurizers(self):
        """List of the class names of all registered featurizers."""
        return list(self.__featurizers__)

    def fit(self, X, y=None, **kwargs):
        """Fit every selected featurizer.

        Keyword arguments whose key equals a group name rename that group first
        (see ``_rename``); all keyword arguments are then forwarded to each
        featurizer's ``fit``.

        Raises:
            TypeError: if ``X`` is not iterable.

        Returns:
            self
        """
        if not isinstance(X, Iterable):
            raise TypeError('parameter "entries" must be a iterable object')

        self._rename(**kwargs)

        # assume y is in same format of X (do not cover other cases now)
        X, y = self._check_input(X, y)

        if isinstance(X, list):
            # no column matched any group: every featurizer gets the whole frame
            for features in self.__featurizer_sets__.values():
                for f in features:
                    if not self._is_selected(f):
                        continue
                    # assume y is in same format of X
                    if y is not None:
                        f.fit(X[0], y[0], **kwargs)
                    else:
                        f.fit(X[0], **kwargs)
        else:
            for k, features in self.__featurizer_sets__.items():
                if k not in X:
                    continue
                for f in features:
                    if not self._is_selected(f):
                        continue
                    if y is not None and k in y:
                        f.fit(X[k], y[k], **kwargs)
                    else:
                        f.fit(X[k], **kwargs)

        return self

    def transform(self, X, **kwargs):
        """Run every selected featurizer and concatenate the results column-wise.

        A ``return_type`` keyword is discarded because each featurizer is always
        called with ``return_type='df'``. A keyword whose key equals a group name
        redirects that group to the column named by the keyword's value.

        Raises:
            TypeError: if ``X`` is not iterable.

        Returns:
            ``None`` for empty input, otherwise a ``pd.DataFrame``.
        """
        if not isinstance(X, Iterable):
            raise TypeError('parameter "entries" must be a iterable object')

        # value comparison; the previous identity test ``is 0`` was implementation-dependent
        if len(X) == 0:
            return None

        kwargs.pop('return_type', None)

        results = []

        X, _ = self._check_input(X, **kwargs)
        if isinstance(X, list):
            # no column matched any group: every featurizer gets the whole frame
            for features in self.__featurizer_sets__.values():
                for f in features:
                    if not self._is_selected(f):
                        continue
                    results.append(f.transform(X[0], return_type='df', **kwargs))
        else:
            for k, features in self.__featurizer_sets__.items():
                if k in kwargs:
                    k = kwargs[k]
                if k not in X:
                    continue
                for f in features:
                    if not self._is_selected(f):
                        continue
                    results.append(f.transform(X[k], return_type='df', **kwargs))

        return pd.concat(results, axis=1)

    @property
    def feature_labels(self):
        """
        Generate attribute names.

        Returns:
            ([str]) attribute labels when there is exactly one group; otherwise a
            tuple of ``(group name, [labels])`` pairs, one per group.

        Raises:
            NotImplementedError: if no featurizer has been registered.
        """
        if len(self.__featurizers__) == 0:
            raise NotImplementedError("no featurizers")
        ret = tuple((k, list(itertools.chain.from_iterable([f.feature_labels for f in features])))
                    for k, features in self.__featurizer_sets__.items())

        if len(ret) == 1:
            return ret[0][1]
        return ret
class BaseCompositionFeaturizer(BaseFeaturizer, metaclass=ABCMeta):
    """Base featurizer for inputs given as chemical compositions.

    Subclasses implement :meth:`mix_function`, which receives the element
    labels and their amounts extracted from one composition.
    """

    def __init__(self,
                 *,
                 elemental_info: Union[pd.DataFrame, None] = None,
                 n_jobs: int = -1,
                 on_errors: str = 'raise',
                 return_type: str = 'any',
                 target_col: Union[List[str], str, None] = None):
        """
        Base class for composition feature.

        ``elemental_info`` is stored as ``self.elements``; when it is ``None`` a
        copy of ``preset.elements_completed`` is used instead. The remaining
        arguments are forwarded unchanged to ``BaseFeaturizer.__init__``.
        """
        super().__init__(
            n_jobs=n_jobs,
            on_errors=on_errors,
            return_type=return_type,
            target_col=target_col,
        )

        self.elements = preset.elements_completed.copy() if elemental_info is None else elemental_info
        # instance-level value; shadows the class-level ``__authors__``
        self.__authors__ = ['TsumiNa']

    def featurize(self, comp):
        """Split one composition into elements and amounts and pass them to ``mix_function``.

        A pymatgen ``Composition`` is first converted with ``as_dict()``; any other
        input is used directly and must provide ``items()``.
        """
        if isinstance(comp, PMGComp):
            comp = comp.as_dict()

        pairs = list(comp.items())
        symbols = [symbol for symbol, _ in pairs]
        amounts = [amount for _, amount in pairs]
        return self.mix_function(symbols, amounts)

    @abstractmethod
    def mix_function(self, elems, nums):
        """

        Parameters
        ----------
        elems: list
            Elements in compound.
        nums: list
            Number of each element.

        Returns
        -------
        descriptor: numpy.ndarray
        """