Source code for WORC.classification.trainclassifier

#!/usr/bin/env python

# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from scipy.stats import uniform
from WORC.classification import crossval as cv
from WORC.classification import construct_classifier as cc
from WORC.IOparser.file_io import load_features
import WORC.IOparser.config_io_classifier as config_io
from WORC.classification.AdvancedSampler import discrete_uniform, \
    log_uniform, boolean_uniform


def trainclassifier(feat_train, patientinfo_train, config,
                    output_hdf,
                    feat_test=None, patientinfo_test=None,
                    fixedsplits=None, verbose=True):
    """Train a classifier using machine learning from features.

    By default, if no split in training and test is supplied, a cross
    validation will be performed. The fitted result is written to
    ``output_hdf`` under the key ``'EstimatorData'``.

    Parameters
    ----------
    feat_train: string, mandatory
            contains the paths to all .hdf5 feature files used.
            modalityname1=file1,file2,file3,... modalityname2=file1,...
            Thus, modalities names are always between a space and a equal
            sign, files are split by commas. We assume that the lists of
            files for each modality has the same length. Files on the
            same position on each list should belong to the same patient.

    patientinfo_train: string, mandatory
            Contains the path referring to a .txt file containing the
            patient label(s) and value(s) to be used for learning. See
            the Github Wiki for the format. A list of strings is joined
            into a single string.

    config: string, mandatory
            path referring to a .ini file containing the configuration
            parameters. See the Github Wiki for the possible
            fields and their description. When a list with multiple
            entries is given, only the first one is used.

    output_hdf: string, mandatory
            path referring to a .hdf5 file to which the final classifier and
            its properties will be written to. When a list with multiple
            entries is given, only the first one is used.

    feat_test: string, optional
            When this argument is supplied, the machine learning will not be
            trained using a cross validation, but rather using a fixed training
            and test split. This field should contain paths of the test set
            feature files, similar to the feat_train argument. An empty
            value (None, '' or []) is treated as "not supplied".

    patientinfo_test: string, optional
            When feat_test is supplied, you can supply optionally a patient label
            file through which the performance will be evaluated.

    fixedsplits: string, optional
            By default, random split cross validation is used to train and
            evaluate the machine learning methods. Optionally, you can provide
            a .xlsx file containing fixed splits to be used. See the Github Wiki
            for the format.

    verbose: boolean, default True
            print final feature values and labels to command line or not.
            Currently not used inside this function; kept for backward
            compatibility of the signature.

    """
    # Convert inputs from lists to strings
    if isinstance(patientinfo_train, list):
        patientinfo_train = ''.join(patientinfo_train)

    if isinstance(patientinfo_test, list):
        patientinfo_test = ''.join(patientinfo_test)

    if isinstance(config, list):
        if len(config) == 1:
            config = ''.join(config)
        else:
            # FIXME
            print('[WORC Warning] You provided multiple configuration files: only the first one will be used!')
            config = config[0]

    if isinstance(output_hdf, list):
        if len(output_hdf) == 1:
            output_hdf = ''.join(output_hdf)
        else:
            # FIXME
            print('[WORC Warning] You provided multiple output hdf files: only the first one will be used!')
            output_hdf = output_hdf[0]

    if isinstance(fixedsplits, list):
        fixedsplits = ''.join(fixedsplits)

    # Load variables from the config file
    config = config_io.load_config(config)
    label_type = config['Labels']['label_names']
    modus = config['Labels']['modus']
    combine_features = config['FeatPreProcess']['Combine']
    combine_method = config['FeatPreProcess']['Combine_method']

    # Decide once whether a separate test set is used, so that loading the
    # test data and selecting the training routine can never disagree
    # (e.g. for an empty string or empty list as feat_test).
    use_test_set = bool(feat_test)

    # Load the feature files and match to label data
    label_data_train, image_features_train =\
        load_features(feat_train, patientinfo_train, label_type,
                      combine_features, combine_method)

    if use_test_set:
        label_data_test, image_features_test =\
            load_features(feat_test, patientinfo_test, label_type,
                          combine_features, combine_method)

    # Construct the required classifier grid
    param_grid = cc.create_param_grid(config)

    # Add non-classifier parameters
    param_grid = add_parameters_to_grid(param_grid, config)

    # For N_iter, perform k-fold crossvalidation
    outputfolder = os.path.dirname(output_hdf)
    if not use_test_set:
        trained_classifier = cv.crossval(config, label_data_train,
                                         image_features_train,
                                         param_grid,
                                         modus=modus,
                                         use_fastr=config['Classification']['fastr'],
                                         fastr_plugin=config['Classification']['fastr_plugin'],
                                         fixedsplits=fixedsplits,
                                         ensemble=config['Ensemble'],
                                         outputfolder=outputfolder,
                                         tempsave=config['General']['tempsave'])
    else:
        trained_classifier = cv.nocrossval(config, label_data_train,
                                           label_data_test,
                                           image_features_train,
                                           image_features_test,
                                           param_grid,
                                           modus=modus,
                                           use_fastr=config['Classification']['fastr'],
                                           fastr_plugin=config['Classification']['fastr_plugin'],
                                           ensemble=config['Ensemble'])

    # An empty dirname means output_hdf lives in the current working
    # directory: there is nothing to create and os.makedirs('') would raise.
    if outputfolder:
        os.makedirs(outputfolder, exist_ok=True)

    trained_classifier.to_hdf(output_hdf, 'EstimatorData')

    print("Saved data!")


def add_parameters_to_grid(param_grid, config):
    """Extend the classifier param grid with the non-classifier settings.

    Reads feature selection, scaling, resampling, preprocessing, one-hot
    encoding and imputation settings from ``config`` and stores them in
    ``param_grid``, either as-is, wrapped in a single-element list, or
    converted to a sampling distribution. A ``random_seed`` distribution is
    added last. ``param_grid`` is modified in place and also returned.
    """
    # How a config value is turned into a grid entry. Two-element values
    # are interpreted as (loc, scale) of the respective distribution.
    converters = {
        'raw': lambda value: value,
        'single': lambda value: [value],
        'bool': lambda value: boolean_uniform(threshold=value),
        'int': lambda value: discrete_uniform(loc=value[0], scale=value[1]),
        'float': lambda value: uniform(loc=value[0], scale=value[1]),
        'log': lambda value: log_uniform(loc=value[0], scale=value[1]),
    }

    # (grid key, config section, config option, converter), in the order
    # in which the entries are added to the grid.
    entries = (
        # Feature scaling
        ('FeatureScaling', 'FeatureScaling', 'scaling_method', 'raw'),
        ('FeatureScaling_skip_features', 'FeatureScaling', 'skip_features', 'single'),
        # Resampling
        ('Resampling_Use', 'Resampling', 'Use', 'bool'),
        ('Resampling_Method', 'Resampling', 'Method', 'raw'),
        ('Resampling_sampling_strategy', 'Resampling', 'sampling_strategy', 'raw'),
        ('Resampling_n_neighbors', 'Resampling', 'n_neighbors', 'int'),
        ('Resampling_k_neighbors', 'Resampling', 'k_neighbors', 'int'),
        ('Resampling_threshold_cleaning', 'Resampling', 'threshold_cleaning', 'float'),
        ('Resampling_n_cores', 'General', 'Joblib_ncores', 'single'),
        # Hyperparameter grid settings for SearchCV
        ('FeatPreProcess', 'FeatPreProcess', 'Use', 'raw'),
        ('Featsel_Variance', 'Featsel', 'Variance', 'bool'),
        ('OneHotEncoding', 'OneHotEncoding', 'Use', 'raw'),
        ('OneHotEncoding_feature_labels_tofit', 'OneHotEncoding', 'feature_labels_tofit', 'single'),
        ('Imputation', 'Imputation', 'use', 'raw'),
        ('ImputationMethod', 'Imputation', 'strategy', 'raw'),
        ('ImputationNeighbours', 'Imputation', 'n_neighbors', 'int'),
        ('SelectFromModel', 'Featsel', 'SelectFromModel', 'bool'),
        ('SelectFromModel_lasso_alpha', 'Featsel', 'SelectFromModel_lasso_alpha', 'float'),
        ('SelectFromModel_estimator', 'Featsel', 'SelectFromModel_estimator', 'raw'),
        ('SelectFromModel_n_trees', 'Featsel', 'SelectFromModel_n_trees', 'int'),
        ('UsePCA', 'Featsel', 'UsePCA', 'bool'),
        ('PCAType', 'Featsel', 'PCAType', 'raw'),
        ('StatisticalTestUse', 'Featsel', 'StatisticalTestUse', 'bool'),
        ('StatisticalTestMetric', 'Featsel', 'StatisticalTestMetric', 'raw'),
        ('StatisticalTestThreshold', 'Featsel', 'StatisticalTestThreshold', 'log'),
        ('ReliefUse', 'Featsel', 'ReliefUse', 'bool'),
        ('ReliefNN', 'Featsel', 'ReliefNN', 'int'),
        ('ReliefSampleSize', 'Featsel', 'ReliefSampleSize', 'float'),
        ('ReliefDistanceP', 'Featsel', 'ReliefDistanceP', 'int'),
        ('ReliefNumFeatures', 'Featsel', 'ReliefNumFeatures', 'int'),
    )

    # Groupwise search and its feature groups are only added when the
    # search is switched on at least once.
    groupwise = config['Featsel']['GroupwiseSearch']
    if 'True' in groupwise:
        param_grid['SelectGroups'] = groupwise
        for group_name, group_setting in config['SelectFeatGroup'].items():
            param_grid[group_name] = group_setting

    for grid_key, section, option, kind in entries:
        param_grid[grid_key] = converters[kind](config[section][option])

    # Many methods require a random seed, so sample one as well
    param_grid['random_seed'] = discrete_uniform(loc=0, scale=2**32 - 1)

    return param_grid