Source code for WORC.featureprocessing.ICCThreshold

#!/usr/bin/env python

# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sklearn.base import BaseEstimator
from sklearn.feature_selection import SelectorMixin
import numpy as np
from WORC.classification.metrics import ICC
import WORC.IOparser.file_io as wio
import pandas as pd


[docs]class ICCThreshold(BaseEstimator, SelectorMixin): ''' Object to fit feature selection based on intra- or inter-class correlation coefficient as defined by Shrout, Patrick E., and Joseph L. Fleiss. "Intraclass correlations: uses in assessing rater reliability." Psychological bulletin 86.2 (1979): 420. http://rokwa.x-y.net/Shrout-Fleiss-ICC.pdf For the intra-class, we use ICC(3,1).For the inter-class ICC, we should use ICC(2,1) according to definitions of the paper, but according to radiomics literatue (https://www.tandfonline.com/doi/pdf/10.1080/0284186X.2018.1445283?needAccess=true, https://www.tandfonline.com/doi/pdf/10.3109/0284186X.2013.812798?needAccess=true), we use ICC(3,1) anyway. The default threshold of 0.75 is also based on the literature metioned above. '''
[docs] def __init__(self, ICCtype='intra', threshold=0.75): ''' Parameters ---------- ICCtype: string, default 'intra' Type of ICC used. intra results in ICC(3,1), inter in ICC(2,1) threshold: float, default 0.75 Threshold for ICC-value in order for feature to be selected ''' self.ICCtype = ICCtype self.threshold = threshold
[docs] def fit(self, X_trains): ''' Select only features specificed by the metric and threshold per patient. Parameters ---------- X_trains: numpy array, mandatory Array containing feature values used for model_selection. Number of objects on first axis, features on second axis, observers on third axis. Y_train: numpy array, mandatory Array containing the binary labels for each object in X_train. ''' self.selectrows = list() self.metric_values = list() # Perform the statistical test for each feature n_patient = X_trains.shape[0] n_feat = X_trains.shape[1] n_observers = X_trains.shape[2] for i_feat in range(0, n_feat): # Select only this specific feature for all objects fv = np.empty((n_patient, n_observers)) for i_obs in range(0, n_observers): fv[:, i_obs] = X_trains[:, i_feat, i_obs] # Compute the ICC try: metric_value = ICC(fv, self.ICCtype) except ValueError as e: print("[WORC Warning] " + str(e) + '. Replacing metric value by 1.') metric_value = 1 self.metric_values.append(metric_value) if metric_value > self.threshold: self.selectrows.append(i_feat)
[docs] def transform(self, inputarray): ''' Transform the inputarray to select only the features based on the result from the fit function. Parameters ---------- inputarray: numpy array, mandatory Array containing the items to use selection on. The type of item in this list does not matter, e.g. floats, strings etc. ''' return np.asarray([np.asarray(x)[self.selectrows].tolist() for x in inputarray])
def _get_support_mask(self): # NOTE: metric is required for the Selector class, but can be empty pass
[docs]def convert_features_ICC_threshold(features_in, csv_out=None, features_out=None, threshold=0.75): ''' For features from multiple observers, compute ICC, return values, and optionally apply thresholding and save output. features_in: list, containing one list per observer. csv_out: csv file, name of file to which ICC values should be written features_out: list, containing file names of output features. ''' # Load the features per observer all_features = list() for fnum, feat_obs in enumerate(features_in): # Load features _, image_features =\ wio.load_data([feat_obs]) # Assume all observers have the same features extracted if fnum == 0: feature_labels = image_features[0][1] # Extract only feature values image_features = [i[0] for i in image_features] all_features.append(image_features) # Convert to array and transpose, so we have (patient, feat, observer) all_features = np.asarray(all_features) all_features = np.transpose(all_features, [1, 2, 0]) # Compute the ICC print('Computing ICC.') ICCthresholder = ICCThreshold(threshold=threshold) ICCthresholder.fit(all_features) # Extract the metric values and save to csv if required if csv_out: print('\t Saving ICC metric values to csv.') ICCs = ICCthresholder.metric_values df = pd.DataFrame(zip(feature_labels, ICCs), columns=['feature_label', 'ICC']) df.to_csv(csv_out) # Save the thresholded features if required: if features_out: print('\t Saving selected features to hdf5.') # Select feature labels fl = ICCthresholder.transform([feature_labels])[0] for i_obs in range(all_features.shape[2]): # Select/hreshold feature values fv = np.squeeze(all_features[:, :, i_obs]) fv = ICCthresholder.transform(fv) for i_patient in range(all_features.shape[0]): # Extract feature values for this patient fv_pat = np.squeeze(fv[i_patient, :]) # Convert to pandas Series and save as hdf5 panda_data = pd.Series([fv_pat.tolist(), fl.tolist()], index=['feature_values', 'feature_labels'], name='Image features' ) output = features_out[i_obs][i_patient] print(f'Saving image features to {output}.') panda_data.to_hdf(output, 'image_features')