Source code for WORC.featureprocessing.StatisticalTestThreshold

#!/usr/bin/env python

# Copyright 2016-2019 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sklearn.base import BaseEstimator
from sklearn.feature_selection import SelectorMixin
import numpy as np
from scipy.stats import ttest_ind, ranksums, mannwhitneyu


[docs]class StatisticalTestThreshold(BaseEstimator, SelectorMixin): ''' Object to fit feature selection based on statistical tests. '''
[docs] def __init__(self, metric='ttest', threshold=0.05): ''' Parameters ---------- metric: string, default 'ttest' Statistical test used for selection. Options are ttest, Welch, Wilcoxon, MannWhitneyU threshold: float, default 0.05 Threshold for p-value in order for feature to be selected ''' self.metric = metric self.threshold = threshold
[docs] def fit(self, X_train, Y_train): ''' Select only features specificed by the metric and threshold per patient. Parameters ---------- X_train: numpy array, mandatory Array containing feature values used for model_selection. Number of objects on first axis, features on second axis. Y_train: numpy array, mandatory Array containing the binary labels for each object in X_train. ''' self.selectrows = list() self.metric_values = list() # Set the metric function if self.metric == 'ttest': self.metric_function = ttest_ind self.parameters = {'equal_var': True} elif self.metric == 'Welch': self.metric_function = ttest_ind self.parameters = {'equal_var': False} elif self.metric == 'Wilcoxon': self.metric_function = ranksums self.parameters = {} elif self.metric == 'MannWhitneyU': self.metric_function = mannwhitneyu self.parameters = {} # Perform the statistical test for each feature multilabel = type(Y_train[0]) is np.ndarray for n_feat in range(0, X_train.shape[1]): # Select only this specific feature for all objects fv = X_train[:, n_feat] if multilabel: # print('Multilabel: take minimum p-value for all label tests.') # We do a statistical test per label and take the minimum p-value n_label = Y_train[0].shape[0] metric_values = list() for i_label in range(n_label): class1 = [i for j, i in enumerate(fv) if np.argmax(Y_train[j]) == i_label] class2 = [i for j, i in enumerate(fv) if np.argmax(Y_train[j]) != i_label] try: metric_value_temp = self.metric_function(class1, class2, **self.parameters)[1] except ValueError as e: print("[WORC Warning] " + str(e) + '. Replacing metric value by 1.') metric_value_temp metric_values.append(metric_value_temp) metric_value = np.min(metric_values) else: # Singlelabel class1 = [i for j, i in enumerate(fv) if Y_train[j] == 1] class2 = [i for j, i in enumerate(fv) if Y_train[j] == 0] try: metric_value = self.metric_function(class1, class2, **self.parameters)[1] except ValueError as e: print("[WORC Warning] " + str(e) + '. Replacing metric value by 1.') metric_value = 1 self.metric_values.append(metric_value) if metric_value < self.threshold: self.selectrows.append(n_feat)
[docs] def transform(self, inputarray): ''' Transform the inputarray to select only the features based on the result from the fit function. Parameters ---------- inputarray: numpy array, mandatory Array containing the items to use selection on. The type of item in this list does not matter, e.g. floats, strings etc. ''' return np.asarray([np.asarray(x)[self.selectrows].tolist() for x in inputarray])
def _get_support_mask(self): # NOTE: metric is required for the Selector class, but can be empty pass