Source code for xenonpy.model.utils.metrics

#  Copyright (c) 2021. TsumiNa. All rights reserved.
#  Use of this source code is governed by a BSD-style
#  license that can be found in the LICENSE file.

from collections import OrderedDict
from typing import Union, List, Tuple

import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, max_error
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

__all__ = ['regression_metrics', 'classification_metrics']


[docs]def regression_metrics(y_true: Union[np.ndarray, pd.Series], y_pred: Union[np.ndarray, pd.Series]) -> OrderedDict:
    """
    Calculate most common regression scores.
    See Also: https://scikit-learn.org/stable/modules/model_evaluation.html
    
    Parameters
    ----------
    y_true
        True results.
    y_pred
        Predicted results.
        
    Returns
    -------
    OrderedDict
        An :class:`collections.OrderedDict` contains regression scores.
        These scores will be calculated: ``mae``, ``mse``, ``rmse``, ``r2``,
        ``pearsonr``, ``spearmanr``, ``p_value``, and ``max_ae``
    """
    if len(y_true.shape) != 1:
        y_true = y_true.flatten()
    if len(y_pred.shape) != 1:
        y_pred = y_pred.flatten()

    mask = ~np.isnan(y_pred)
    y_true = y_true[mask]
    y_pred = y_pred[mask]

    mae = mean_absolute_error(y_true, y_pred)
    maxae = max_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    pr, p_val = pearsonr(y_true, y_pred)
    sr, _ = spearmanr(y_true, y_pred)
    return OrderedDict(
        mae=mae,
        mse=mse,
        rmse=rmse,
        r2=r2,
        pearsonr=pr,
        spearmanr=sr,
        p_value=p_val,
        max_ae=maxae,
    )


[docs]def classification_metrics(
    y_true: Union[np.ndarray, pd.DataFrame, pd.Series],
    y_pred: Union[np.ndarray, pd.Series],
    *,
    average: Union[None, List[str], Tuple[str]] = ('weighted', 'micro', 'macro'),
    labels=None,
) -> dict:
    """
    Calculate most common classification scores.
    See also: https://scikit-learn.org/stable/modules/model_evaluation.html
    
    Parameters
    ----------
    y_true
        True results.
    y_pred
        Predicted results.
    average
        This parameter is required for multiclass/multilabel targets. If None, the scores for each class are returned.
        Otherwise, this determines the type of averaging performed on the data:

        binary:
            Only report results for the class specified by pos_label. This is applicable only if targets (y_{true,pred})
            are binary.
        micro:
            Calculate metrics globally by counting the total true positives, false negatives and false positives.
        macro:
            Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into
            account.
        weighted:
            Calculate metrics for each label, and find their average weighted by support (the number of true instances
            for each label). This alters ``macro`` to account for label imbalance; it can result in an F-score that is
            not between precision and recall.
    labels
        The set of labels to include when average != ``binary``, and their order if average is None.
        Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority
        negative class, while labels not present in the data will result in 0 components in a macro average.
        For multilabel targets, labels are column indices.
        By default, all labels in y_true and y_pred are used in sorted order.

    Returns
    -------
    OrderedDict
        An :class:`collections.OrderedDict` contains classification scores.
        These scores will always contains ``accuracy``, ``f1``, ``precision`` and ``recall``.
        For multilabel targets, based on the selection of the ``average`` parameter, the **weighted**, **micro**,
        and **macro** scores of ``f1`, ``precision``, and ``recall`` will be calculated.
    """
    if average is not None and len(average) == 0:
        raise ValueError('need average')

    if len(y_true.shape) != 1:
        y_true = np.argmax(y_true, 1)
    if len(y_pred.shape) != 1:
        y_pred = np.argmax(y_pred, 1)

    mask = ~np.isnan(y_pred)
    y_true = y_true[mask]
    y_pred = y_pred[mask]

    ret = dict(accuracy=accuracy_score(y_true, y_pred))

    ret.update(
        f1=f1_score(y_true, y_pred, average=None, labels=labels),
        precision=precision_score(y_true, y_pred, average=None, labels=labels),
        recall=recall_score(y_true, y_pred, average=None, labels=labels),
    )

    if 'binary' in average:
        ret.update(
            binary_f1=f1_score(y_true, y_pred, average='binary', labels=labels),
            binary_precision=precision_score(y_true, y_pred, average='binary', labels=labels),
            binary_recall=recall_score(y_true, y_pred, average='binary', labels=labels),
        )

    if 'micro' in average:
        ret.update(
            micro_f1=f1_score(y_true, y_pred, average='micro', labels=labels),
            micro_precision=precision_score(y_true, y_pred, average='micro', labels=labels),
            micro_recall=recall_score(y_true, y_pred, average='micro', labels=labels),
        )

    if 'macro' in average:
        ret.update(
            macro_f1=f1_score(y_true, y_pred, average='macro', labels=labels),
            macro_precision=precision_score(y_true, y_pred, average='macro', labels=labels),
            macro_recall=recall_score(y_true, y_pred, average='macro', labels=labels),
        )
        
    if 'weighted' in average:
        ret.update(
            weighted_f1=f1_score(y_true, y_pred, average='weighted', labels=labels),
            weighted_precision=precision_score(y_true, y_pred, average='weighted', labels=labels),
            weighted_recall=recall_score(y_true, y_pred, average='weighted', labels=labels),
        )

    if 'samples' in average:
        ret.update(
            samples_f1=f1_score(y_true, y_pred, average='samples', labels=labels),
            samples_precision=precision_score(y_true, y_pred, average='samples', labels=labels),
            samples_recall=recall_score(y_true, y_pred, average='samples', labels=labels),
        )

    return ret