Source code for WORC.classification.metrics

#!/usr/bin/env python

# Copyright 2016-2020 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import division
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import numpy as np
from sklearn import metrics
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import make_scorer, average_precision_score
from sklearn.metrics import check_scoring as check_scoring_sklearn
from scipy.linalg import pinv
from imblearn.metrics import geometric_mean_score


def performance_singlelabel(y_truth, y_prediction, y_score, regression=False):
    '''
    Single-class performance metrics.
    '''
    if regression:
        y_truth = np.array(y_truth).flatten()
        r2score = metrics.r2_score(y_truth, y_prediction)
        MSE = metrics.mean_squared_error(y_truth, y_prediction)
        coefICC = ICC(np.column_stack((y_prediction, y_truth)))
        C = pearsonr(y_prediction, y_truth)
        PearsonC = C[0]
        PearsonP = C[1]
        C = spearmanr(y_prediction, y_truth)
        SpearmanC = C.correlation
        SpearmanP = C.pvalue

        return r2score, MSE, coefICC, PearsonC, PearsonP, SpearmanC, SpearmanP

    else:
        # Compute confusion matrix and extract measures
        c_mat = confusion_matrix(y_truth, y_prediction)
        if c_mat.shape[0] == 0:
            print('[WORC Warning] No samples in y_truth and y_prediction.')
            TN = 0
            FN = 0
            TP = 0
            FP = 0
        elif c_mat.shape[0] == 1:
            print('[WORC Warning] Only a single class represented in y_truth and y_prediction.')
            if 0 in c_mat:
                TN = c_mat[0, 0]
                FN = 0
                TP = 0
                FP = 0
            else:
                TN = 0
                FN = 0
                TP = c_mat[0, 0]
                FP = 0
        else:
            TN = c_mat[0, 0]
            FN = c_mat[1, 0]
            TP = c_mat[1, 1]
            FP = c_mat[0, 1]

        if FN == 0 and TP == 0:
            if c_mat.shape[0] != 2:
                sensitivity = np.NaN
            else:
                sensitivity = 0
        else:
            sensitivity = float(TP) / (TP + FN)

        if FP == 0 and TN == 0:
            if c_mat.shape[0] != 2:
                specificity = np.NaN
            else:
                specificity = 0
        else:
            specificity = float(TN) / (FP + TN)

        if TP == 0 and FP == 0:
            if c_mat.shape[0] != 2:
                precision = np.NaN
            else:
                precision = 0
        else:
            precision = float(TP) / (TP + FP)

        if TN == 0 and FN == 0:
            if c_mat.shape[0] != 2:
                npv = np.NaN
            else:
                npv = 0
        else:
            npv = float(TN) / (TN + FN)

        # Additionally, compute accuracy, AUC and F1-score
        accuracy = accuracy_score(y_truth, y_prediction)
        BCA = balanced_accuracy_score(y_truth, y_prediction)
        try:
            auc = roc_auc_score(y_truth, y_score)
        except ValueError as e:
            print('[WORC Warning] ' + str(e) + '. Setting AUC to NaN.')
            auc = np.NaN

        f1_score_out = f1_score(y_truth, y_prediction, average='weighted')

        return accuracy, BCA, sensitivity, specificity, precision, npv, f1_score_out, auc


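# A minimal usage sketch of performance_singlelabel, assuming a binary
# classification problem; the toy labels and scores, and the helper name
# below, are illustrative assumptions and not part of WORC. The function
# returns, in order: accuracy, BCA, sensitivity, specificity, precision,
# NPV, weighted F1-score and AUC.
def _example_performance_singlelabel():
    y_truth = [0, 0, 1, 1, 1]
    y_prediction = [0, 1, 1, 1, 0]
    y_score = [0.1, 0.6, 0.8, 0.7, 0.4]
    return performance_singlelabel(y_truth, y_prediction, y_score,
                                   regression=False)

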
def performance_multilabel(y_truth, y_prediction, y_score=None, beta=1):
    '''
    Multiclass performance metrics.

    y_truth and y_prediction should both be lists with the multiclass label of
    each object, e.g.

    y_truth = [0, 0, 0, 0, 0, 0, 2, 2, 1, 1, 2]         ### Ground truth
    y_prediction = [0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 2]    ### Predicted labels
    y_score = [[0.3, 0.3, 0.4], [0.2, 0.6, 0.2], ... ]  # Normalized score per patient for all labels (three in this example)

    Calculation of accuracy according to the formula suggested in the
    CADDementia Grand Challenge, http://caddementia.grand-challenge.org,
    and the TADPOLE challenge,
    https://tadpole.grand-challenge.org/Performance_Metrics/.

    Calculation of multi-class AUC according to classpy:
    https://bitbucket.org/bigr_erasmusmc/classpy/src/master/classpy/multi_class_auc.py
    '''
    cm = confusion_matrix(y_truth, y_prediction)

    # Determine no. of classes
    labels_class = np.unique(y_truth)
    n_class = len(labels_class)

    # Split confusion matrix into true and false positives and negatives
    TP = np.zeros(shape=(1, n_class), dtype=int)
    FN = np.zeros(shape=(1, n_class), dtype=int)
    FP = np.zeros(shape=(1, n_class), dtype=int)
    TN = np.zeros(shape=(1, n_class), dtype=int)
    n = np.zeros(shape=(1, n_class), dtype=int)
    for i in range(n_class):
        TP[:, i] = cm[i, i]
        FN[:, i] = np.sum(cm[i, :]) - cm[i, i]
        FP[:, i] = np.sum(cm[:, i]) - cm[i, i]
        TN[:, i] = np.sum(cm[:]) - TP[:, i] - FP[:, i] - FN[:, i]

    n = np.sum(cm)

    # Determine Accuracy
    Accuracy = (np.sum(TP)) / n

    # BCA: Balanced Class Accuracy
    BCA = list()
    for i in range(n_class):
        BCAi = 1 / 2 * (TP[:, i] / (TP[:, i] + FN[:, i]) +
                        TN[:, i] / (TN[:, i] + FP[:, i]))
        BCA.append(BCAi)

    AverageAccuracy = np.mean(BCA)

    # Determine total positives and negatives
    P = TP + FN
    N = FP + TN

    # Calculation of sensitivity
    Sensitivity = TP / P
    Sensitivity = np.mean(Sensitivity)

    # Calculation of specificity
    Specificity = TN / N
    Specificity = np.mean(Specificity)

    # Calculation of precision
    Precision = TP / (TP + FP)
    Precision = np.nan_to_num(Precision)
    Precision = np.mean(Precision)

    # Calculation of NPV
    NPV = TN / (TN + FN)
    NPV = np.nan_to_num(NPV)
    NPV = np.mean(NPV)

    # Calculation of F1_score
    F1_score = (((1 + (beta**2)) * (Sensitivity * Precision)) /
                ((beta**2) * (Precision + Sensitivity)))
    F1_score = np.nan_to_num(F1_score)
    F1_score = np.mean(F1_score)

    # Calculation of multi-class AUC according to classpy:
    # https://bitbucket.org/bigr_erasmusmc/classpy/src/master/classpy/multi_class_auc.py
    if y_score is not None:
        AUC = multi_class_auc(y_truth, y_score)
    else:
        AUC = None

    return Accuracy, Sensitivity, Specificity, Precision, NPV, F1_score, AUC, AverageAccuracy


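# A minimal sketch of performance_multilabel using the toy labels from the
# docstring above; without y_score the returned AUC is None. The helper name
# below is hypothetical and serves only as an illustration.
def _example_performance_multilabel():
    y_truth = [0, 0, 0, 0, 0, 0, 2, 2, 1, 1, 2]
    y_prediction = [0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 2]
    return performance_multilabel(y_truth, y_prediction)

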
def pairwise_auc(y_truth, y_score, class_i, class_j):
    # Filter out the probabilities for class_i and class_j
    y_score = [est[class_i] for ref, est in zip(y_truth, y_score)
               if ref in (class_i, class_j)]
    y_truth = [ref for ref in y_truth if ref in (class_i, class_j)]

    # Sort the y_truth by the estimated probabilities
    sorted_y_truth = [y for x, y in sorted(zip(y_score, y_truth),
                                           key=lambda p: p[0])]

    # Calculate the sum of ranks for class_i
    sum_rank = 0
    for index, element in enumerate(sorted_y_truth):
        if element == class_i:
            sum_rank += index + 1
    sum_rank = float(sum_rank)

    # Get the counts for class_i and class_j
    n_class_i = float(y_truth.count(class_i))
    n_class_j = float(y_truth.count(class_j))

    # If a class is empty, AUC is 0.0
    if n_class_i == 0 or n_class_j == 0:
        return 0.0

    # Calculate the pairwise AUC
    return (sum_rank - (0.5 * n_class_i * (n_class_i + 1))) / (n_class_i * n_class_j)


def multi_class_auc(y_truth, y_score):
    classes = np.unique(y_truth)

    # if any(t == 0.0 for t in np.sum(y_score, axis=1)):
    #     raise ValueError('No AUC is calculated, output probabilities are missing')

    pairwise_auc_list = [0.5 * (pairwise_auc(y_truth, y_score, i, j) +
                                pairwise_auc(y_truth, y_score, j, i))
                         for i in classes for j in classes if i < j]

    c = len(classes)
    return (2.0 * sum(pairwise_auc_list)) / (c * (c - 1))


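# A minimal sketch of the pairwise/multi-class AUC above on a three-class toy
# problem; each row of y_score holds one (assumed normalized) probability per
# class, indexed by the integer class labels. The helper name and the toy
# values are assumptions for illustration only.
def _example_multi_class_auc():
    y_truth = [0, 0, 1, 1, 2, 2]
    y_score = [[0.8, 0.1, 0.1],
               [0.6, 0.3, 0.1],
               [0.2, 0.7, 0.1],
               [0.3, 0.5, 0.2],
               [0.1, 0.2, 0.7],
               [0.2, 0.2, 0.6]]
    return multi_class_auc(y_truth, y_score)

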
def multi_class_auc_score(y_truth, y_score):
    return make_scorer(multi_class_auc, needs_proba=True)


def f1_weighted_predictproba(y_truth, y_score):
    '''Calculate F1-score, but based on predict_proba instead of predict.

    Probabilities are thresholded at 0.5.
    '''
    # Convert predictions to binary by thresholding at 0.5
    y_pred = np.zeros(y_score.shape)
    y_pred[y_score >= 0.5] = 1
    y_pred[y_score < 0.5] = 0

    # Compute and return score
    return f1_score(y_truth, y_pred, average='weighted')


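# A minimal sketch of f1_weighted_predictproba; y_score is assumed to be a
# NumPy array of positive-class probabilities, which is thresholded at 0.5
# before computing the weighted F1-score. The helper name is hypothetical.
def _example_f1_weighted_predictproba():
    y_truth = [0, 1, 1, 0]
    y_score = np.array([0.2, 0.8, 0.6, 0.4])
    return f1_weighted_predictproba(y_truth, y_score)

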
def check_scoring(estimator, scoring=None, allow_none=False):
    '''
    Surrogate for sklearn's check_scoring to enable the use of some other
    scoring metrics.
    '''
    if scoring == 'average_precision_weighted':
        scorer = make_scorer(average_precision_score, average='weighted',
                             needs_proba=True)
    elif scoring == 'gmean':
        scorer = make_scorer(geometric_mean_score, needs_proba=True)
    elif scoring == 'f1_weighted_predictproba':
        scorer = make_scorer(f1_weighted_predictproba, needs_proba=True)
    else:
        scorer = check_scoring_sklearn(estimator, scoring=scoring)
    return scorer


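# A minimal sketch of check_scoring: for the custom WORC metric names it
# builds a make_scorer object, otherwise it falls back to sklearn's
# check_scoring. The SVC estimator, the make_classification toy data and the
# helper name below are assumptions for illustration only.
def _example_check_scoring():
    from sklearn.datasets import make_classification
    from sklearn.svm import SVC
    X, y = make_classification(n_samples=40, random_state=0)
    clf = SVC(probability=True).fit(X, y)
    scorer = check_scoring(clf, scoring='average_precision_weighted')
    return scorer(clf, X, y)

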
def check_multimetric_scoring(estimator, scoring=None):
    """Wrapper around the sklearn function to enable more scoring options.

    Check the scoring parameter in cases when multiple metrics are allowed.

    Parameters
    ----------
    estimator : sklearn estimator instance
        The estimator for which the scoring will be applied.

    scoring : string, callable, list/tuple, dict or None, default: None
        A single string (see :ref:`scoring_parameter`) or a callable
        (see :ref:`scoring`) to evaluate the predictions on the test set.
        For evaluating multiple metrics, either give a list of (unique)
        strings or a dict with names as keys and callables as values.
        NOTE that when using custom scorers, each scorer should return a
        single value. Metric functions returning a list/array of values
        can be wrapped into multiple scorers that return one value each.
        See :ref:`multimetric_grid_search` for an example.
        If None the estimator's score method is used.
        The return value in that case will be ``{'score': <default_scorer>}``.
        If the estimator's score method is not available, a ``TypeError``
        is raised.

    Returns
    -------
    scorers_dict : dict
        A dict mapping each scorer name to its validated scorer.

    is_multimetric : bool
        True if scorer is a list/tuple or dict of callables
        False if scorer is None/str/callable
    """
    if callable(scoring) or scoring is None or isinstance(scoring, str):
        scorers = {"score": check_scoring(estimator, scoring=scoring)}
        return scorers, False
    else:
        err_msg_generic = ("scoring should either be a single string or "
                           "callable for single metric evaluation or a "
                           "list/tuple of strings or a dict of scorer name "
                           "mapped to the callable for multiple metric "
                           "evaluation. Got %s of type %s"
                           % (repr(scoring), type(scoring)))

        if isinstance(scoring, (list, tuple, set)):
            err_msg = ("The list/tuple elements must be unique "
                       "strings of predefined scorers. ")
            invalid = False
            try:
                keys = set(scoring)
            except TypeError:
                invalid = True
            if invalid:
                raise ValueError(err_msg)

            if len(keys) != len(scoring):
                raise ValueError(err_msg + "Duplicate elements were found in"
                                 " the given list. %r" % repr(scoring))
            elif len(keys) > 0:
                if not all(isinstance(k, str) for k in keys):
                    if any(callable(k) for k in keys):
                        raise ValueError(err_msg +
                                         "One or more of the elements were "
                                         "callables. Use a dict of score name "
                                         "mapped to the scorer callable. "
                                         "Got %r" % repr(scoring))
                    else:
                        raise ValueError(err_msg +
                                         "Non-string types were found in "
                                         "the given list. Got %r"
                                         % repr(scoring))
                scorers = {scorer: check_scoring(estimator, scoring=scorer)
                           for scorer in scoring}
            else:
                raise ValueError(err_msg +
                                 "Empty list was given. %r" % repr(scoring))

        elif isinstance(scoring, dict):
            keys = set(scoring)
            if not all(isinstance(k, str) for k in keys):
                raise ValueError("Non-string types were found in the keys of "
                                 "the given dict. scoring=%r" % repr(scoring))
            if len(keys) == 0:
                raise ValueError("An empty dict was passed. %r"
                                 % repr(scoring))
            scorers = {key: check_scoring(estimator, scoring=scorer)
                       for key, scorer in scoring.items()}
        else:
            raise ValueError(err_msg_generic)

        return scorers, True


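# A minimal sketch of check_multimetric_scoring with a list of metric names,
# mixing a standard sklearn scorer with the custom 'gmean' option handled by
# check_scoring above. The SVC estimator and the helper name are assumptions
# for illustration only.
def _example_check_multimetric_scoring():
    from sklearn.svm import SVC
    scorers, is_multimetric = check_multimetric_scoring(
        SVC(probability=True), scoring=['accuracy', 'gmean'])
    return scorers, is_multimetric

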
def ICC(M, ICCtype='inter'):
    '''
    Input:
        M is a matrix of observations. Rows: patients, columns: observers.
        ICCtype: ICC type, currently "inter" or "intra".
    '''
    n, k = M.shape

    SStotal = np.var(M, ddof=1) * (n*k - 1)
    MSR = np.var(np.mean(M, 1), ddof=1) * k
    MSW = np.sum(np.var(M, 1, ddof=1)) / n
    MSC = np.var(np.mean(M, 0), ddof=1) * n
    MSE = (SStotal - MSR * (n - 1) - MSC * (k - 1)) / ((n - 1) * (k - 1))

    if ICCtype == 'intra':
        r = (MSR - MSW) / (MSR + (k - 1) * MSW)
    elif ICCtype == 'inter':
        r = (MSR - MSE) / (MSR + (k - 1) * MSE + k * (MSC - MSE) / n)
    else:
        raise ValueError('No valid ICC type given.')

    return r


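# A minimal sketch of the ICC function on a toy observation matrix with four
# patients (rows) and three observers (columns); the values and the helper
# name are illustrative assumptions.
def _example_ICC():
    M = np.array([[9., 2., 5.],
                  [6., 1., 3.],
                  [8., 4., 6.],
                  [7., 1., 2.]])
    return ICC(M, ICCtype='inter')

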
def ICC_anova(Y, ICCtype='inter', more=False):
    '''
    Adapted from Nipype with a slight alteration to distinguish inter and
    intra. The data Y are entered as a 'table', i.e. subjects are in rows and
    repeated measures in columns.

    One-sample repeated measures ANOVA:

        Y = XB + E with X = [Factor / Subjects]
    '''
    [nb_subjects, nb_conditions] = Y.shape
    dfc = nb_conditions - 1
    dfe = (nb_subjects - 1) * dfc
    dfr = nb_subjects - 1

    # Compute the repeated measure effect
    # ------------------------------------

    # Sum Square Total
    mean_Y = np.mean(Y)
    SST = ((Y - mean_Y) ** 2).sum()

    # Create the design matrix for the different levels
    x = np.kron(np.eye(nb_conditions), np.ones((nb_subjects, 1)))  # sessions
    x0 = np.tile(np.eye(nb_subjects), (nb_conditions, 1))  # subjects
    X = np.hstack([x, x0])

    # Sum Square Error
    predicted_Y = np.dot(np.dot(np.dot(X, pinv(np.dot(X.T, X))), X.T),
                         Y.flatten('F'))
    residuals = Y.flatten('F') - predicted_Y
    SSE = (residuals ** 2).sum()

    residuals.shape = Y.shape

    MSE = SSE / dfe

    # Sum square session effect - between columns/sessions
    SSC = ((np.mean(Y, 0) - mean_Y) ** 2).sum() * nb_subjects
    MSC = SSC / dfc / nb_subjects

    session_effect_F = MSC / MSE

    # Sum Square subject effect - between rows/subjects
    SSR = SST - SSC - SSE
    MSR = SSR / dfr

    # ICC(3,1) = (mean square subject - mean square error) /
    #            (mean square subject + (k-1)*mean square error)
    if ICCtype == 'intra':
        ICC = (MSR - MSE) / (MSR + dfc * MSE)
    elif ICCtype == 'inter':
        ICC = (MSR - MSE) / (MSR + dfc * MSE + nb_conditions * (MSC - MSE) / nb_subjects)
    else:
        raise ValueError('No valid ICC type given.')

    e_var = MSE  # variance of error
    r_var = (MSR - MSE) / nb_conditions  # variance between subjects

    if more:
        return ICC, r_var, e_var, session_effect_F, dfc, dfe
    else:
        return ICC


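# A minimal sketch of ICC_anova on the same kind of toy table (subjects in
# rows, repeated measures in columns); with more=True it also returns the
# variance components and degrees of freedom. Values and helper name are
# illustrative assumptions.
def _example_ICC_anova():
    Y = np.array([[9., 2., 5.],
                  [6., 1., 3.],
                  [8., 4., 6.],
                  [7., 1., 2.]])
    return ICC_anova(Y, ICCtype='inter', more=True)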