
#  Copyright (c) 2021. yoshida-lab. All rights reserved.
#  Use of this source code is governed by a BSD-style
#  license that can be found in the LICENSE file.

import warnings

import numpy as np
import pandas as pd
import torch

from xenonpy.descriptor.base import BaseFeaturizer
from xenonpy.model import SequentialLinear

__all__ = ['FrozenFeaturizer']


class FrozenFeaturizer(BaseFeaturizer):
    """
    A featurizer that extracts hidden-layer outputs from a NN model.
    """

    def __init__(self,
                 model: torch.nn.Module = None,
                 *,
                 cuda: bool = False,
                 depth=None,
                 n_layer=None,
                 on_errors='raise',
                 return_type='any',
                 target_col=None):
        """
        Parameters
        ----------
        model: torch.nn.Module
            Source model.
        cuda: bool
            If ``True``, run on GPU.
        depth: int
            How many hidden layers to retrieve from the NN model,
            counted backwards from the output layer.
        n_layer: int
            Number of layers to retrieve, starting from the given depth.
        on_errors: str
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``. The length of the column
            corresponds to the number of feature labels.
            When 'keep', return a column with the exception objects.
            The default is 'raise', which will raise the exception.
        return_type: str
            Specify the return type. Can be ``any``, ``array`` or ``df``.
            ``array`` and ``df`` force the return type to ``np.ndarray`` and
            ``pd.DataFrame`` respectively. If ``any``, the return type depends
            on the input type. Default is ``any``.
        target_col
            Only relevant when the input is a pd.DataFrame; otherwise ignored.
            Specify a single column to be used for the transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is ``None``.
        """
        super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.depth = depth
        self.n_layer = n_layer
        self.model = model
        self.cuda = cuda
        self._ret = []
        self.__authors__ = ['TsumiNa']
        if isinstance(self.model, torch.nn.Module):
            # switch to inference mode; a non-Module model is rejected
            # later in ``featurize``
            self.model.eval()
        self._depth = 0
    def featurize(self, descriptor, *, depth=None, n_layer=None):
        if not isinstance(self.model, torch.nn.Module):
            raise TypeError('<model> must be an instance of <torch.nn.Module>')
        hlayers = []
        if isinstance(descriptor, pd.DataFrame):
            descriptor = descriptor.values
        x_ = torch.from_numpy(descriptor).float()
        if self.cuda:
            # ``Tensor.cuda()``/``Tensor.cpu()`` are not in-place,
            # so the result has to be re-assigned
            x_ = x_.cuda()
            self.model.cuda()
        else:
            x_ = x_.cpu()
            self.model.cpu()

        if isinstance(self.model, SequentialLinear):
            for n, m in self.model.named_children():
                if 'layer_' in n:
                    # record the pre-activation output of each hidden layer
                    hlayers.append(m.linear(x_).data)
                    x_ = m(x_)
        else:
            for m in self.model[:-1]:
                hlayers.append(m.layer(x_).data)
                x_ = m(x_)

        # fall back to the values given at construction time
        # when depth/n_layer are not passed in here
        if depth is None:
            depth = self.depth
        if n_layer is None:
            n_layer = self.n_layer

        # resolve the effective depth
        if depth is None:  # self.depth must be None as well
            self.depth = len(hlayers)  # update self.depth
            self._depth = len(hlayers)
        elif depth > len(hlayers):
            warnings.warn('<depth> is greater than the max depth of hidden layers')
            self._depth = len(hlayers)
        else:
            self._depth = depth

        # resolve the slice end from n_layer
        if n_layer is None:
            l_end = 0
        else:
            l_end = n_layer - self._depth
        if l_end > -1:
            if l_end > 0:
                warnings.warn('<n_layer> is over the max depth of hidden layers starting at the given <depth>')
            ret = hlayers[-self._depth:]
        else:
            ret = hlayers[-self._depth:l_end]

        if self.cuda:
            ret = [l.cpu().numpy() for l in ret]
        else:
            ret = [l.numpy() for l in ret]
        self._ret = ret
        return np.concatenate(ret, axis=1)
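    # Illustration of the ``depth``/``n_layer`` slicing above, with
    # hypothetical numbers: given 5 hidden layers, depth=3 keeps hlayers[-3:]
    # (the 3 layers closest to the output). Adding n_layer=2 gives
    # l_end = 2 - 3 = -1, so hlayers[-3:-1] keeps 2 of those layers, counted
    # from depth 3 towards the output.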
    @property
    def feature_labels(self):
        if self._depth == 0:
            raise ValueError('Cannot generate labels before transform.')
        return ['L(' + str(i - self._depth) + ')_' + str(j + 1)
                for i in range(len(self._ret))
                for j in range(self._ret[i].shape[1])]
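
# ---------------------------------------------------------------------------
# Usage sketch (not part of the library): runs an untrained SequentialLinear
# on random data purely to show the featurizer call flow. The SequentialLinear
# arguments below (positional in/out feature sizes plus an ``h_neurons``
# keyword) are assumptions; check the signature in your xenonpy version.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # a toy regression net: 290 descriptor inputs -> 2 hidden layers -> 1 output
    net = SequentialLinear(290, 1, h_neurons=(0.8, 0.6))  # assumed signature
    ff = FrozenFeaturizer(net, depth=2, n_layer=2)

    # 10 random samples with 290 descriptor columns
    data = pd.DataFrame(np.random.rand(10, 290))

    hidden = ff.transform(data)     # hidden-layer outputs, one column per neuron
    print(hidden.shape)             # (10, total number of kept hidden neurons)
    print(ff.feature_labels[:3])    # e.g. ['L(-2)_1', 'L(-2)_2', 'L(-2)_3']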