Source code for WORC.featureprocessing.Relief

#!/usr/bin/env python

# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sklearn.base import BaseEstimator
from sklearn.feature_selection import SelectorMixin
import numpy as np
import sklearn.neighbors as nn
# from skrebate import ReliefF


[docs]class SelectMulticlassRelief(BaseEstimator, SelectorMixin): ''' Object to fit feature selection based on the type group the feature belongs to. The label for the feature is used for this procedure. '''
[docs] def __init__(self, n_neighbours=3, sample_size=1, distance_p=2, numf=None, random_state=None): ''' Parameters ---------- n_neightbors: integer Number of nearest neighbours used. sample_size: float Percentage of samples used to calculate score distance_p: integer Parameter in minkov distance usde for nearest neighbour calculation numf: integer, default None Number of important features to be selected with respect to their ranking. If None, all are used. ''' self.no_neighbours = n_neighbours self.sample_size = sample_size self.distance_p = distance_p self.numf = numf self.random_state = random_state
[docs] def fit(self, X, y, random_state=None): ''' Select only features specificed by parameters per patient. Parameters ---------- feature_values: numpy array, mandatory Array containing feature values used for model_selection. Number of objects on first axis, features on second axis. feature_labels: list, mandatory Contains the labels of all features used. The index in this list will be used in the transform funtion to select features. ''' # Multiclass relief function if len(y.shape) > 1: indices, _ = self.multi_class_relief(X, y, nb=self.no_neighbours, sample_size=self.sample_size, distance_p=self.distance_p, numf=self.numf, random_state=self.random_state) else: indices, _ = self.single_class_relief(X, y, nb=self.no_neighbours, sample_size=self.sample_size, distance_p=self.distance_p, numf=self.numf, random_state=self.random_state) self.selectrows = indices
[docs] def transform(self, inputarray): ''' Transform the inputarray to select only the features based on the result from the fit function. Parameters ---------- inputarray: numpy array, mandatory Array containing the items to use selection on. The type of item in this list does not matter, e.g. floats, strings etc. ''' return np.asarray([np.asarray(x)[self.selectrows].tolist() for x in inputarray])
# return self.ReliefF.transform(inputarray) def _get_support_mask(self): # NOTE: Method is required for the Selector class, but can be empty pass
[docs] def multi_class_relief(self, feature_set, label_set, nb=3, sample_size=1, distance_p=2, numf=None, random_state=None): nrow, ncol = feature_set.shape nlabel = label_set.shape[1] np.random.seed(random_state) sample_list = np.random.choice(range(nrow), int(nrow * sample_size), replace=False) feature_score = np.zeros((1, ncol)) prob = label_set.mean(axis=0) n_sample = dict() pair_score = dict() # find positive and negative samples for each label for label in range(nlabel): n_sample[label, 0] = [] n_sample[label, 1] = [] for row in sample_list: if label_set[row, label] == 0: n_sample[label, 0].append(row) else: n_sample[label, 1].append(row) for label1 in range(nlabel - 1): for label2 in range(label1 + 1, nlabel): pair_score[label1, label2] = np.zeros((1, ncol)) if n_sample[label1, 0].__len__() >= nb and n_sample[label1, 1].__len__() >= nb: # find near miss for label1 n_neighbor_finder = nn.NearestNeighbors(n_neighbors=nb, p=distance_p) n_neighbor_finder.fit(np.asarray(feature_set[n_sample[label1, 0], :])) near_miss = n_neighbor_finder.kneighbors(np.asarray(feature_set[n_sample[label1, 0], :]), return_distance=False) # find near hit for label1 p_neighbor_finder = nn.NearestNeighbors(n_neighbors=nb, p=distance_p) p_neighbor_finder.fit(np.asarray(feature_set[n_sample[label1, 1], :])) near_hit = p_neighbor_finder.kneighbors(np.asarray(feature_set[n_sample[label1, 1], :]), return_distance=False) for label2 in range(label1 + 1, nlabel): for r in range(near_miss.__len__()): for c in range(nb): if label_set[near_miss[r, c], label2] == 1: pair_score[label1, label2] += 1.0 * (prob[label2] / (1 - prob[label1])) *\ np.abs(feature_set[n_sample[label1, 0][r], :] - feature_set[near_miss[r, c], :]) / nb for r in range(n_sample[label1, 1].__len__()): for c in range(nb): if label_set[near_hit[r, c], label2] == 1: pair_score[label1, label2] -= (prob[label2] /(1 - prob[label1])) *\ np.abs(feature_set[n_sample[label1, 1][r], :] - feature_set[near_hit[r, c], :]) / nb for label1 in range(nlabel - 1): for label2 in range(label1 + 1, nlabel): feature_score += pair_score[label1, label2] feature_score = feature_score[0] sorted_index = feature_score.argsort().tolist() sorted_index.reverse() sorted_index = np.array(sorted_index) feature_score = feature_score[sorted_index] if numf is None: numf = len(sorted_index) # Make sure we select at maximum all features numf = min(numf, len(sorted_index)) sorted_index = sorted_index[0:numf] return sorted_index, feature_score
[docs] def single_class_relief(self, feature_set, label_set, nb=3, sample_size=1, distance_p=2, numf=None, random_state=None): nrow, ncol = feature_set.shape np.random.seed(random_state) sample_list = np.random.choice(range(nrow), int(nrow * sample_size), replace=False) feature_score = np.zeros((1, ncol)) n_sample = dict() # find positive and negative samples for each label n_sample[0] = [] n_sample[1] = [] for row in sample_list: if label_set[row] == 0: n_sample[0].append(row) else: n_sample[1].append(row) if n_sample[0].__len__() >= nb and n_sample[1].__len__() >= nb: # find near miss for label1 n_neighbor_finder = nn.NearestNeighbors(n_neighbors=nb, p=distance_p) n_neighbor_finder.fit(np.asarray(feature_set[n_sample[0], :])) near_miss = n_neighbor_finder.kneighbors(np.asarray(feature_set[n_sample[0], :]), return_distance=False) # find near hit for label1 p_neighbor_finder = nn.NearestNeighbors(n_neighbors=nb, p=distance_p) p_neighbor_finder.fit(np.asarray(feature_set[n_sample[1], :])) near_hit = p_neighbor_finder.kneighbors(np.asarray(feature_set[n_sample[1], :]), return_distance=False) for r in range(near_miss.__len__()): for c in range(nb): if label_set[near_miss[r, c]] == 1: feature_score += 1.0 * np.abs(feature_set[n_sample[0][r], :] - feature_set[near_miss[r, c], :]) / nb for r in range(n_sample[1].__len__()): for c in range(nb): if label_set[near_hit[r, c]] == 1: feature_score -= 1.0 * np.abs(feature_set[n_sample[1][r], :] - feature_set[near_hit[r, c], :]) / nb feature_score = feature_score[0] sorted_index = feature_score.argsort().tolist() sorted_index.reverse() sorted_index = np.array(sorted_index) feature_score = feature_score[sorted_index] if numf is None: numf = len(sorted_index) # Make sure we select at maximum all features numf = min(numf, len(sorted_index)) sorted_index = sorted_index[0:numf] return sorted_index, feature_score
# def single_class_relief_skrebate(self, feature_set, label_set, nb=3, sample_size=1, # distance_p=2, numf=None): # self.ReliefF = ReliefF(n_features_to_select=numf, n_neighbors=nb, n_jobs=1) # self.ReliefF.fit(feature_set, label_set) # return None, None