
#  Copyright (c) 2021. yoshida-lab. All rights reserved.
#  Use of this source code is governed by a BSD-style
#  license that can be found in the LICENSE file.

import warnings

import numpy as np
import pandas as pd
import torch

from xenonpy.descriptor.base import BaseFeaturizer
from xenonpy.model import SequentialLinear

__all__ = ['FrozenFeaturizer']


class FrozenFeaturizer(BaseFeaturizer):
    """
    A featurizer that extracts hidden-layer outputs from a NN model.
    """

    def __init__(self,
                 model: torch.nn.Module = None,
                 *,
                 cuda: bool = False,
                 depth=None,
                 n_layer=None,
                 on_errors='raise',
                 return_type='any',
                 target_col=None):
        """
        Parameters
        ----------
        model: torch.nn.Module
            Source model.
        cuda: bool
            If ``True``, run on GPU.
        depth: int
            How many hidden layers to retrieve from the NN model,
            counted backwards from the output layer.
        n_layer: int
            Number of layers to retrieve, starting from the given depth.
        on_errors: str
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``. The length of the column
            corresponds to the number of feature labels.
            When 'keep', return a column with the exception objects.
            The default is 'raise', which will raise the exception.
        return_type: str
            Specify the return type. Can be ``any``, ``array`` or ``df``.
            ``array`` and ``df`` force the return type to ``np.ndarray`` and
            ``pd.DataFrame`` respectively. If ``any``, the return type depends
            on the input type. Default is ``any``.
        target_col
            Only relevant when the input is a pd.DataFrame; otherwise ignored.
            Specify a single column to be used for the transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is ``None``.
        """
        super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self.depth = depth
        self.n_layer = n_layer
        self.model = model
        self.cuda = cuda
        self._ret = []
        self.__authors__ = ['TsumiNa']
        if isinstance(self.model, torch.nn.Module):
            # switch to inference mode; a non-Module model is rejected
            # later in ``featurize``
            self.model.eval()
        self._depth = 0
    def featurize(self, descriptor, *, depth=None, n_layer=None):
        if not isinstance(self.model, torch.nn.Module):
            raise TypeError('<model> must be an instance of <torch.nn.Module>')
        hlayers = []
        if isinstance(descriptor, pd.DataFrame):
            descriptor = descriptor.values
        x_ = torch.from_numpy(descriptor).float()
        if self.cuda:
            # ``Tensor.cuda()``/``Tensor.cpu()`` are not in-place,
            # so the result has to be re-assigned
            x_ = x_.cuda()
            self.model.cuda()
        else:
            x_ = x_.cpu()
            self.model.cpu()

        if isinstance(self.model, SequentialLinear):
            for n, m in self.model.named_children():
                if 'layer_' in n:
                    # record the pre-activation output of each hidden layer
                    hlayers.append(m.linear(x_).data)
                    x_ = m(x_)
        else:
            for m in self.model[:-1]:
                hlayers.append(m.layer(x_).data)
                x_ = m(x_)

        # fall back to the values given at construction time
        # when depth/n_layer are not passed in here
        if depth is None:
            depth = self.depth
        if n_layer is None:
            n_layer = self.n_layer

        # resolve the effective depth
        if depth is None:  # self.depth must be None as well
            self.depth = len(hlayers)  # update self.depth
            self._depth = len(hlayers)
        elif depth > len(hlayers):
            warnings.warn('<depth> is greater than the max depth of hidden layers')
            self._depth = len(hlayers)
        else:
            self._depth = depth

        # resolve the slice end from n_layer
        if n_layer is None:
            l_end = 0
        else:
            l_end = n_layer - self._depth
        if l_end > -1:
            if l_end > 0:
                warnings.warn('<n_layer> is over the max depth of hidden layers starting at the given <depth>')
            ret = hlayers[-self._depth:]
        else:
            ret = hlayers[-self._depth:l_end]

        if self.cuda:
            ret = [l.cpu().numpy() for l in ret]
        else:
            ret = [l.numpy() for l in ret]
        self._ret = ret
        return np.concatenate(ret, axis=1)
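    # Illustration of the ``depth``/``n_layer`` slicing above, with
    # hypothetical numbers: given 5 hidden layers, depth=3 keeps hlayers[-3:]
    # (the 3 layers closest to the output). Adding n_layer=2 gives
    # l_end = 2 - 3 = -1, so hlayers[-3:-1] keeps 2 of those layers, counted
    # from depth 3 towards the output.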
    @property
    def feature_labels(self):
        if self._depth == 0:
            raise ValueError('Cannot generate labels before transform.')
        return ['L(' + str(i - self._depth) + ')_' + str(j + 1)
                for i in range(len(self._ret))
                for j in range(self._ret[i].shape[1])]
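
# ---------------------------------------------------------------------------
# Usage sketch (not part of the library): runs an untrained SequentialLinear
# on random data purely to show the featurizer call flow. The SequentialLinear
# arguments below (positional in/out feature sizes plus an ``h_neurons``
# keyword) are assumptions; check the signature in your xenonpy version.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # a toy regression net: 290 descriptor inputs -> 2 hidden layers -> 1 output
    net = SequentialLinear(290, 1, h_neurons=(0.8, 0.6))  # assumed signature
    ff = FrozenFeaturizer(net, depth=2, n_layer=2)

    # 10 random samples with 290 descriptor columns
    data = pd.DataFrame(np.random.rand(10, 290))

    hidden = ff.transform(data)     # hidden-layer outputs, one column per neuron
    print(hidden.shape)             # (10, total number of kept hidden neurons)
    print(ff.feature_labels[:3])    # e.g. ['L(-2)_1', 'L(-2)_2', 'L(-2)_3']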