Source code for WORC.IOparser.file_io

#!/usr/bin/env python

# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pandas as pd
import WORC.processing.label_processing as lp
import WORC.addexceptions as WORCexceptions
import numpy as np
import os


def load_data(featurefiles, patientinfo=None, label_names=None, modnames=None,
              combine_features=False, combine_method='mean'):
    """Read feature files and stack the features per patient in an array.

    Additionally, if a patient label file is supplied, the features from
    a patient will be matched to the labels.

    Parameters
    ----------
    featurefiles: list, mandatory
        List containing all paths to the .hdf5 feature files to be loaded.
        The argument should contain a list per modality, e.g.
        [[features_mod1_patient1, features_mod1_patient2, ...],
        [features_mod2_patient1, features_mod2_patient2, ...]].

    patientinfo: string, optional
        Path referring to the .txt file to be used to read patient
        labels from. See the Github Wiki for the format.

    label_names: list, optional
        List containing all the labels that should be extracted from
        the patientinfo file.

    modnames: list, optional
        Names of the modalities, one per entry of featurefiles. These are
        appended to the feature labels. When empty or None, artificial
        names ('M0', 'M1', ...) are used instead.

    combine_features: boolean, default False
        Determines whether to combine the features from all samples
        of the same patient or not. The combination step reads the
        'label' and 'label_name' entries of the label data, which this
        function only creates itself when no patientinfo is given as a
        dictionary with just 'patient_IDs'; presumably patientinfo is
        therefore required for combining (confirm against
        label_processing.findlabeldata).

    combine_method: string, mean or max
        If features per patient should be combined, determine how.
        NaN values are ignored in both cases.

    Returns
    -------
    label_data: dict
        Contains at least the key 'patient_IDs'. When patientinfo is
        given, this is the dictionary returned by
        label_processing.findlabeldata.

    image_features: list
        One (feature_values, feature_labels) tuple per object. All objects
        share the same sorted list of feature labels; features missing for
        an object are set to NaN.

    Raises
    ------
    WORCValueError
        If patient IDs were found in some, but not all feature files, or
        if the label file could not be parsed.
    WORCKeyError
        If an unknown combine_method has to be applied.

    """
    # Read out all feature values and labels
    image_features_temp = list()
    unique_labels = set()
    pids = list()
    for i_patient in range(len(featurefiles[0])):
        feature_values_temp = list()
        feature_labels_temp = list()
        for i_mod, modality_files in enumerate(featurefiles):
            feat_temp = pd.read_hdf(modality_files[i_patient])
            feature_values_temp += feat_temp.feature_values
            if not modnames:
                # Create artificial names
                suffix = '_M' + str(i_mod)
            else:
                # Use the provided modality names
                suffix = '_' + str(modnames[i_mod])

            feature_labels_temp += [f + suffix
                                    for f in feat_temp.feature_labels]

        image_features_temp.append((feature_values_temp, feature_labels_temp))

        # Also keep track of all unique label names
        unique_labels.update(feature_labels_temp)

        # If PID in feature file, use those. Note that only the file of the
        # last modality is checked.
        if 'patient' in list(feat_temp.keys()):
            pids.append(feat_temp.patient)

    # Check when we found patient ID's, if we did for all objects
    if pids and len(pids) != len(image_features_temp):
        raise WORCexceptions.WORCValueError(
            f'Length of pids {len(pids)} does not match number of objects '
            f'{len(image_features_temp)}. Found {pids}.')

    # If some objects miss certain features, we will identify these with NaN values
    feature_labels_all = sorted(unique_labels)
    image_features = list()
    for feature_values, feature_labels in image_features_temp:
        # Map each label to its first position, like list.index would return
        first_index = dict()
        for index, label in enumerate(feature_labels):
            first_index.setdefault(label, index)

        feat = [feature_values[first_index[f]] if f in first_index else np.nan
                for f in feature_labels_all]
        image_features.append((feat, feature_labels_all))

    # Get the labels and patient IDs
    if patientinfo is not None:
        # We use the feature files of the first modality to match to patient name
        pfiles = featurefiles[0]
        try:
            if pids:
                label_data, image_features =\
                    lp.findlabeldata(patientinfo,
                                     label_names,
                                     pids=pids,
                                     objects=image_features)
            else:
                label_data, image_features =\
                    lp.findlabeldata(patientinfo,
                                     label_names,
                                     filenames=pfiles,
                                     objects=image_features)
        except ValueError as e:
            message = str(e) + '. Please take a look at your labels' +\
                ' file and make sure it is formatted correctly. ' +\
                r'See also https://worc.readthedocs.io/en/latest/static/configuration.html#config-labels.'
            raise WORCexceptions.WORCValueError(message) from e

        if len(label_names) == 1:
            print("Labels:")
            print(label_data['label'])
            print('Total of ' + str(label_data['patient_IDs'].shape[0]) +
                  ' patients')
            pos = np.sum(label_data['label'])
            neg = label_data['patient_IDs'].shape[0] - pos
            print(('{} positives, {} negatives').format(pos, neg))
    else:
        # Use filenames as patient IDs
        label_data = dict()
        label_data['patient_IDs'] =\
            [os.path.basename(i) for i in featurefiles[0]]

    # Optionally, combine features of same patient
    if combine_features:
        print('Combining features of the same patient.')
        feature_labels = image_features[0][1]
        label_name = label_data['label_name']
        new_label_data = list()
        new_pids = list()
        new_features = list()
        pid_length = len(label_data['patient_IDs'])
        print(f'\tOriginal number of samples / patients: {pid_length}.')

        # Group the object indices per patient, keeping the order in which
        # the patients first occur
        indices_per_pid = dict()
        for index, pid in enumerate(label_data['patient_IDs']):
            indices_per_pid.setdefault(pid, list()).append(index)

        for pid, indices in indices_per_pid.items():
            # NOTE: Assume all object from one patient have the same label
            # and that there is only a single label type
            new_label_data.append(label_data['label'][0][indices[0]])
            new_pids.append(pid)

            # Only process patients which occur multiple times
            occurrences = len(indices)
            if occurrences > 1:
                print(f'\tFound {occurrences} occurrences for {pid}.')
                feature_values_thispatient =\
                    np.asarray([image_features[i][0] for i in indices])
                if combine_method == 'mean':
                    feature_values_thispatient =\
                        np.nanmean(feature_values_thispatient, axis=0).tolist()
                elif combine_method == 'max':
                    feature_values_thispatient =\
                        np.nanmax(feature_values_thispatient, axis=0).tolist()
                else:
                    raise WORCexceptions.WORCKeyError(f'{combine_method} is not a valid combination method, should be mean or max.')

                # And add the new one
                new_features.append((feature_values_thispatient,
                                     feature_labels))
            else:
                new_features.append(image_features[indices[0]])

        # Adjust the labels and features for further processing
        label_data = dict()
        label_data['patient_IDs'] = np.asarray(new_pids)
        label_data['label'] = np.asarray([new_label_data])
        label_data['label_name'] = label_name

        image_features = new_features

        pid_length = len(label_data['patient_IDs'])
        print(f'\tNumber of samples / patients after combining: {pid_length}.')

    return label_data, image_features
def load_features(feat, patientinfo, label_type, combine_features=False,
                  combine_method='mean'):
    """Parse feature file specifications per modality and load the data.

    The feature files are split per modality, after which load_data is
    used to read the features and, if supplied, the patient labels.

    Parameters
    ----------
    feat: list or string, mandatory
        Either a plain list of feature file paths (or a single path as
        string), which are all assigned to a single modality called
        'Mod0', or a list with one string per modality in the format
        'ModalityName=file1,file2,...'. The format is detected from the
        first entry only.

    patientinfo: string
        Path referring to the .txt file to be used to read patient
        labels from. Passed on to load_data.

    label_type: list
        List containing all the labels that should be extracted from
        the patientinfo file. Passed on to load_data as label_names.

    combine_features: boolean, default False
        Determines whether to combine the features from all samples
        of the same patient or not. Passed on to load_data.

    combine_method: string, mean or max
        If features per patient should be combined, determine how.
        Passed on to load_data.

    Returns
    -------
    label_data, image_features
        The output of load_data.

    Raises
    ------
    IndexError
        If feat is empty, or if the first entry uses the
        'ModalityName=...' format while a later entry lacks the '=' sign.

    """
    # A single string is treated as a list with one entry, as iterating over
    # the string itself would split it into separate characters
    if isinstance(feat, str):
        feat = [feat]

    # Check if features is a simple list of files without modality names
    if '=' not in feat[0]:
        feat = ['Mod0=' + ','.join(feat)]

    # Split the feature files per modality
    feat_temp = list()
    modnames = list()
    for feat_mod in feat:
        feat_mod_temp = [str(item).strip() for item in feat_mod.split(',')]

        # The first item contains the name of the modality, followed by a = sign.
        # Only split on the first = sign, as the file path may contain one too
        temp = [str(item).strip() for item in feat_mod_temp[0].split('=', 1)]
        modnames.append(temp[0])
        feat_mod_temp[0] = temp[1]

        # Append the files to the main list
        feat_temp.append(feat_mod_temp)

    feat = feat_temp

    # Read the features and classification data
    label_data, image_features =\
        load_data(feat, patientinfo, label_type, modnames,
                  combine_features, combine_method)

    return label_data, image_features
def convert_config_pyradiomics(config):
    """Convert WORC to PyRadiomics config.

    Translate the string fields of the 'PyRadiomics' and 'ImageFeatures'
    sections of a WORC configparser object into a PyRadiomics compatible
    dictionary with the entries 'imageType', 'setting' and 'featureClass'.

    Raises a WORCValueError when resampledPixelSpacing is given, but does
    not consist of exactly three values.
    """
    pyradiomics = config['PyRadiomics']

    def enabled(option):
        # Booleans are stored as the literal string 'True' in the config
        return pyradiomics[option] == 'True'

    def parse_or_none(option, parser):
        # The literal string 'None' indicates an unset option
        value = pyradiomics[option]
        return None if value == 'None' else parser(value)

    # Create main config structure
    image_types = dict()
    settings = dict()
    feature_classes = dict()
    outputconfig = {
        'imageType': image_types,
        'setting': settings,
        'featureClass': feature_classes,
    }

    # Take out the specific PyRadiomics values
    settings['geometryTolerance'] = float(pyradiomics['geometryTolerance'])
    settings['normalize'] = enabled('normalize')
    settings['normalizeScale'] = int(pyradiomics['normalizeScale'])
    settings['interpolator'] = pyradiomics['interpolator']
    settings['preCrop'] = enabled('preCrop')
    settings['label'] = int(pyradiomics['label'])
    settings['force2D'] = enabled('force2D')
    settings['force2Ddimension'] = int(pyradiomics['force2Ddimension'])
    settings['voxelArrayShift'] = int(pyradiomics['voxelArrayShift'])
    settings['binCount'] = parse_or_none('binCount', int)
    settings['binWidth'] = parse_or_none('binWidth', float)

    raw_spacing = pyradiomics['resampledPixelSpacing']
    if raw_spacing == 'None':
        settings['resampledPixelSpacing'] = None
    else:
        spacing = [float(i) for i in raw_spacing.split(',')]
        settings['resampledPixelSpacing'] = spacing
        if len(spacing) != 3:
            length = len(spacing)
            raise WORCexceptions.WORCValueError(f'Length PyRadiomics resampledPixelSpacing should be 3, got {length}.')

    # Extract several general values as well
    # Convert strings with values to list of ints
    settings['distances'] = [
        int(s) for s in config['ImageFeatures']['GLCM_distances'].split(',')
    ]

    # Check if we need to apply transforms to the image
    for image_type in ('Original', 'Wavelet'):
        if enabled(image_type):
            image_types[image_type] = dict()

    if enabled('LoG'):
        log_sigmas = config['ImageFeatures']['log_sigma'].split(',')
        image_types['LoG'] = {'sigma': [int(s) for s in log_sigmas]}

    # Determine which features to extract: WORC option -> PyRadiomics class
    feature_options = (
        ('extract_firstorder', 'firstorder'),
        ('extract_shape', 'shape'),
        ('texture_GLCM', 'glcm'),
        ('texture_GLRLM', 'glrlm'),
        ('texture_GLSZM', 'glszm'),
        ('texture_GLDM', 'gldm'),
        ('texture_NGTDM', 'ngtdm'),
    )
    for option, feature_class in feature_options:
        if enabled(option):
            feature_classes[feature_class] = None

    return outputconfig