# Source code for WORC.IOparser.file_io

#!/usr/bin/env python

# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pandas as pd
import WORC.processing.label_processing as lp
import WORC.addexceptions as WORCexceptions
import numpy as np
import os


def _combine_patient_features(label_data, image_features, combine_method):
    """Merge all samples that share a patient ID into a single sample.

    Parameters
    ----------
    label_data: dict
            Must contain 'patient_IDs'. If it contains 'label', that entry
            is indexed as a 2D array (labels x samples) and 'label_name'
            must be present as well.

    image_features: list
            One (feature_values, feature_labels) tuple per sample, in the
            same order as label_data['patient_IDs'].

    combine_method: string, mean or max
            How the feature values of multiple samples are combined. NaN
            values are ignored (np.nanmean / np.nanmax).

    Returns
    -------
    label_data: dict
            New dictionary with one entry per unique patient, in order of
            first occurrence.

    image_features: list
            One (feature_values, feature_labels) tuple per unique patient.
    """
    patient_ids = list(label_data['patient_IDs'])
    print(f'\tOriginal number of samples / patients: {len(patient_ids)}.')

    # Group the sample indices per patient in a single pass. Dictionaries
    # keep insertion order, so patients stay in order of first occurrence.
    indices_per_patient = dict()
    for index, pid in enumerate(patient_ids):
        indices_per_patient.setdefault(pid, []).append(index)

    combined_features = list()
    for pid, indices in indices_per_patient.items():
        # Only process patients which occur multiple times
        if len(indices) == 1:
            combined_features.append(image_features[indices[0]])
            continue

        print(f'\tFound {len(indices)} occurrences for {pid}.')
        # The method is only validated when something has to be combined,
        # so data without duplicates never fails on it.
        if combine_method == 'mean':
            combiner = np.nanmean
        elif combine_method == 'max':
            combiner = np.nanmax
        else:
            raise WORCexceptions.WORCValueError(f'{combine_method} is not a valid combination method, should be mean or max.')

        values = np.asarray([image_features[i][0] for i in indices])
        feature_labels = image_features[indices[0]][1]
        combined_features.append((combiner(values, axis=0).tolist(),
                                  feature_labels))

    new_label_data = dict()
    new_label_data['patient_IDs'] = np.asarray(list(indices_per_patient))
    if 'label' in label_data:
        # NOTE: Assume all objects from one patient have the same label,
        # hence take the label(s) of the first sample of each patient.
        # All label rows are kept so 'label' stays aligned with 'label_name'.
        first_indices = np.asarray([indices[0] for indices
                                    in indices_per_patient.values()],
                                   dtype=int)
        new_label_data['label'] =\
            np.asarray(label_data['label'])[:, first_indices]
        new_label_data['label_name'] = label_data['label_name']

    print('\tNumber of samples / patients after combining: ' +
          f'{len(indices_per_patient)}.')

    return new_label_data, combined_features


def load_data(featurefiles, patientinfo=None, label_names=None, modnames=None,
              combine_features=False, combine_method='mean'):
    """Read feature files and stack the features per patient in an array.

        Additionally, if a patient label file is supplied, the features from
        a patient will be matched to the labels.

        Parameters
        ----------
        featurefiles: list, mandatory
                List containing all paths to the .hdf5 feature files to be loaded.
                The argument should contain a list per modality, e.g.
                [[features_mod1_patient1, features_mod1_patient2, ...],
                 [features_mod2_patient1, features_mod2_patient2, ...]].

        patientinfo: string, optional
                Path referring to the .txt file to be used to read patient
                labels from. See the Github Wiki for the format.

        label_names: list, optional
                List containing all the labels that should be extracted from
                the patientinfo file.

        modnames: list, optional
                Name per modality, appended to each feature label as
                '_<name>'. If empty or None, '_M<modality index>' is used.

        combine_features: boolean, default False
                Determines whether to combine the features from all samples
                of the same patient or not.

        combine_method: string, mean or max
                If features per patient should be combined, determine how.

        Returns
        -------
        label_data: dict
                Always contains 'patient_IDs'. The label entries are only
                present when patientinfo is supplied.

        image_features: list
                One (feature_values, feature_labels) tuple per sample, where
                feature_labels is the sorted list of all labels found and
                missing features are NaN.
    """
    # Read out all feature values and labels
    image_features_temp = list()
    unique_labels = set()
    pids = list()
    for i_patient in range(len(featurefiles[0])):
        feature_values_temp = list()
        feature_labels_temp = list()
        for i_mod, modality_files in enumerate(featurefiles):
            feat_temp = pd.read_hdf(modality_files[i_patient])
            feature_values_temp.extend(feat_temp.feature_values)
            if not modnames:
                # Create artificial names
                suffix = '_M' + str(i_mod)
            else:
                # Use the provided modality names
                suffix = '_' + str(modnames[i_mod])
            feature_labels_temp.extend(f + suffix
                                       for f in feat_temp.feature_labels)

        image_features_temp.append((feature_values_temp, feature_labels_temp))

        # Also keep track of all unique label names
        unique_labels.update(feature_labels_temp)

        # If PID in feature file, use those. Note that this looks at the
        # file of the last modality read for this patient.
        if 'patient' in list(feat_temp.keys()):
            pids.append(feat_temp.patient)

    # Check when we found patient ID's, if we did for all objects
    if pids and len(pids) != len(image_features_temp):
        raise WORCexceptions.WORCValueError(
            f'Length of pids ({len(pids)}) does not match number of ' +
            f'objects ({len(image_features_temp)}). Found {pids}.')

    # If some objects miss certain features, we will identify these with NaN values
    feature_labels_all = sorted(unique_labels)
    image_features = list()
    for feature_values, feature_labels in image_features_temp:
        # Map each label to the position of its first occurrence, which is
        # what list.index returns, but without a linear scan per label.
        first_position = dict()
        for position, label in enumerate(feature_labels):
            first_position.setdefault(label, position)

        feat = [feature_values[first_position[f]] if f in first_position
                else np.nan
                for f in feature_labels_all]
        image_features.append((feat, feature_labels_all))

    # Get the labels and patient IDs
    if patientinfo is not None:
        try:
            if pids:
                label_data, image_features =\
                    lp.findlabeldata(patientinfo,
                                     label_names,
                                     pids=pids,
                                     objects=image_features)
            else:
                # We use the feature files of the first modality to match
                # to patient name
                label_data, image_features =\
                    lp.findlabeldata(patientinfo,
                                     label_names,
                                     filenames=featurefiles[0],
                                     objects=image_features)
        except ValueError as e:
            message = str(e) + '. Please take a look at your labels' +\
                ' file and make sure it is formatted correctly. ' +\
                r'See also https://worc.readthedocs.io/en/latest/static/configuration.html#config-labels.'
            raise WORCexceptions.WORCValueError(message) from e

        if label_names is not None and len(label_names) == 1:
            print("Labels:")
            print(label_data['label'])
            print('Total of ' + str(label_data['patient_IDs'].shape[0]) +
                  ' patients')
            pos = np.sum(label_data['label'])
            neg = label_data['patient_IDs'].shape[0] - pos
            print(('{} positives, {} negatives').format(pos, neg))
    else:
        # Use filenames as patient IDs
        label_data = dict()
        label_data['patient_IDs'] = [os.path.basename(i)
                                     for i in featurefiles[0]]

    # Optionally, combine features of same patient
    if combine_features:
        print('Combining features of the same patient.')
        label_data, image_features =\
            _combine_patient_features(label_data, image_features,
                                      combine_method)

    return label_data, image_features


def load_features(feat, patientinfo, label_type, combine_features=False,
                  combine_method='mean'):
    """Parse the feature file specification and load features and labels.

        The specification is split per modality, after which the actual
        reading and matching to labels is delegated to load_data.

        Parameters
        ----------
        feat: list or string, mandatory
                Either a plain list of paths to feature files, which is
                treated as a single modality named 'Mod0', or a list with
                one string per modality formatted as
                'ModalityName=file1,file2,...'. A single string is treated
                as a list with one element.

        patientinfo: string
                Passed on to load_data as patientinfo.

        label_type: list
                Passed on to load_data as label_names.

        combine_features: boolean, default False
                Passed on to load_data.

        combine_method: string, default mean
                Passed on to load_data.

        Returns
        -------
        label_data, image_features
                The output of load_data.

    """
    # A single string is one entry: iterating it would split it in characters
    if isinstance(feat, str):
        feat = [feat]

    # Check if features is a simple list of files without modality names
    if '=' not in feat[0]:
        feat = ['Mod0=' + ','.join(feat)]

    # Split the feature files per modality
    feat_temp = list()
    modnames = list()
    for feat_mod in feat:
        feat_mod_temp = [str(item).strip() for item in feat_mod.split(',')]

        # The first item contains the name of the modality, followed by a
        # = sign. Only split on the first one, as the path of the file
        # itself may contain a = sign as well.
        modname, separator, first_file = feat_mod_temp[0].partition('=')
        if not separator:
            raise WORCexceptions.WORCValueError(
                f'Feature specification "{feat_mod}" should be formatted ' +
                'as ModalityName=file1,file2,...')
        modnames.append(modname.strip())
        feat_mod_temp[0] = first_file.strip()

        # Append the files to the main list
        feat_temp.append(feat_mod_temp)

    # Read the features and classification data
    label_data, image_features =\
        load_data(feat_temp, patientinfo,
                  label_type, modnames,
                  combine_features,
                  combine_method)

    return label_data, image_features


def _parse_int_or_float(text):
    """Convert a numeric string to an int if possible, otherwise a float."""
    try:
        return int(text)
    except ValueError:
        return float(text)


def convert_config_pyradiomics(config):
    """Convert WORC to PyRadiomics config.

    Convert fields from WORC configparser object to a PyRadiomics
    compatible dictionary.

    Parameters
    ----------
    config: mapping
            Must provide the sections 'PyRadiomics' and 'ImageFeatures',
            each mapping option names to string values. Booleans are
            given as 'True', missing values as 'None' and lists as comma
            separated strings.

    Returns
    -------
    outputconfig: dict
            Dictionary with the keys 'imageType', 'setting' and
            'featureClass'.

    Raises
    ------
    WORCValueError
            If resampledPixelSpacing is given but does not contain exactly
            three values.
    """
    pyradiomics = config['PyRadiomics']
    imagefeatures = config['ImageFeatures']

    def is_true(key):
        # Only the exact string 'True' enables an option
        return pyradiomics[key] == 'True'

    def optional(key, converter):
        # The string 'None' means the option is not set
        value = pyradiomics[key]
        return None if value == 'None' else converter(value)

    # Create main config structure
    outputconfig = dict()
    outputconfig['imageType'] = dict()
    outputconfig['setting'] = dict()
    outputconfig['featureClass'] = dict()

    # Take out the specific PyRadiomics values
    setting = outputconfig['setting']
    setting['geometryTolerance'] = float(pyradiomics['geometryTolerance'])
    setting['normalize'] = is_true('normalize')
    setting['normalizeScale'] = int(pyradiomics['normalizeScale'])
    setting['interpolator'] = pyradiomics['interpolator']
    setting['preCrop'] = is_true('preCrop')
    setting['label'] = int(pyradiomics['label'])
    setting['force2D'] = is_true('force2D')
    setting['force2Ddimension'] = int(pyradiomics['force2Ddimension'])
    setting['voxelArrayShift'] = int(pyradiomics['voxelArrayShift'])
    setting['binCount'] = optional('binCount', int)
    setting['binWidth'] = optional('binWidth', float)

    spacing = optional('resampledPixelSpacing',
                       lambda value: [float(i) for i in value.split(',')])
    setting['resampledPixelSpacing'] = spacing
    if spacing is not None and len(spacing) != 3:
        length = len(spacing)
        raise WORCexceptions.WORCValueError(f'Length PyRadiomics resampledPixelSpacing should be 3, got {length}.')

    # Extract several general values as well
    # Convert strings with values to list of ints
    setting['distances'] =\
        [int(s) for s in imagefeatures['GLCM_distances'].split(',')]

    # Check if we need to apply transforms to the image
    image_types = outputconfig['imageType']
    if is_true('Original'):
        image_types['Original'] = dict()

    if is_true('Wavelet'):
        image_types['Wavelet'] = dict()

    if is_true('LoG'):
        # Whole numbers stay ints as before, while fractional sigmas are
        # now accepted instead of failing on the int conversion.
        sigmas = [_parse_int_or_float(s)
                  for s in imagefeatures['log_sigma'].split(',')]
        image_types['LoG'] = {'sigma': sigmas}

    # Determine which features to extract: config option -> feature class
    feature_classes = (
        ('extract_firstorder', 'firstorder'),
        ('extract_shape', 'shape'),
        ('texture_GLCM', 'glcm'),
        ('texture_GLRLM', 'glrlm'),
        ('texture_GLSZM', 'glszm'),
        ('texture_GLDM', 'gldm'),
        ('texture_NGTDM', 'ngtdm'),
    )
    for option, feature_class in feature_classes:
        if is_true(option):
            outputconfig['featureClass'][feature_class] = None

    return outputconfig