Source code for WORC.classification.trainclassifier

#!/usr/bin/env python

# Copyright 2016-2023 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import numpy as np
from scipy.stats import uniform
from WORC.classification import crossval as cv
from WORC.classification import construct_classifier as cc
from WORC.IOparser.file_io import load_features
import WORC.IOparser.config_io_classifier as config_io
from WORC.classification.AdvancedSampler import discrete_uniform, \
    log_uniform, boolean_uniform
import json


def trainclassifier(feat_train, patientinfo_train, config,
                    output_hdf,
                    feat_test=None, patientinfo_test=None,
                    fixedsplits=None,
                    output_smac=None, verbose=True):
    """Train a classifier using machine learning from features.

    By default, if no test set is supplied, a cross validation will be
    performed.

    Parameters
    ----------
    feat_train: string, mandatory
            contains the paths to all .hdf5 feature files used.
            modalityname1=file1,file2,file3,... modalityname2=file1,...
            Thus, modalities names are always between a space and a equal
            sign, files are split by commas. We assume that the lists of
            files for each modality has the same length. Files on the
            same position on each list should belong to the same patient.

    patientinfo_train: string, mandatory
            Contains the path referring to a .txt file containing the
            patient label(s) and value(s) to be used for learning. See
            the Github Wiki for the format. A list is joined into a
            single string.

    config: string, mandatory
            path referring to a .ini file containing the parameters
            used for feature extraction. See the Github Wiki for the possible
            fields and their description. When a list with several
            entries is given, only the first one is used.

    output_hdf: string, mandatory
            path referring to a .hdf5 file to which the final classifier and
            its properties will be written. When a list with several
            entries is given, only the first one is used. The parent
            folder is created when it does not exist yet.

    feat_test: string, optional
            When this argument is supplied, the machine learning will not be
            trained using a cross validation, but rather using a fixed training
            and test split. This field should contain paths of the test set
            feature files, similar to the feat_train argument. An empty
            value ('' or []) is treated the same as None.

    patientinfo_test: string, optional
            When feat_test is supplied, you can supply optionally a patient
            label file through which the performance will be evaluated.

    fixedsplits: string, optional
            By default, random split cross validation is used to train and
            evaluate the machine learning methods. Optionally, you can provide
            a .xlsx file containing fixed splits to be used. See the Github
            Wiki for the format. Only used in the cross validation branch.

    output_smac: string, optional
            Forwarded to the cross validation as ``smac_result_file``. When
            a list with several entries is given, only the first one is
            used. Not used when a test set is supplied.

    verbose: boolean, default True
            Kept for backward compatibility; this function does not
            read it.

    """
    # Convert inputs from lists to strings
    if isinstance(patientinfo_train, list):
        patientinfo_train = ''.join(patientinfo_train)

    if isinstance(patientinfo_test, list):
        patientinfo_test = ''.join(patientinfo_test)

    config = _first_entry(config, 'configuration files')
    output_hdf = _first_entry(output_hdf, 'output hdf files')

    if isinstance(fixedsplits, list):
        fixedsplits = ''.join(fixedsplits)

    output_smac = _first_entry(output_smac, 'output json files')

    # Load variables from the config file
    config = config_io.load_config(config)
    label_type = config['Labels']['label_names']
    modus = config['Labels']['modus']

    combine_features = config['FeatPreProcess']['Combine']
    combine_method = config['FeatPreProcess']['Combine_method']

    # Decide once whether a separate test set is used, so that loading the
    # test features and choosing the training routine can never disagree
    # (an empty, non-None feat_test used to end in a NameError).
    use_test_set = bool(feat_test)

    # Load the feature files and match to label data
    label_data_train, image_features_train =\
        load_features(feat_train, patientinfo_train, label_type,
                      combine_features, combine_method)

    if use_test_set:
        label_data_test, image_features_test =\
            load_features(feat_test, patientinfo_test, label_type,
                          combine_features, combine_method)

    # Construct the required classifier grid
    param_grid = cc.create_param_grid(config)

    # Add non-classifier parameters
    param_grid = add_parameters_to_grid(param_grid, config)

    # Delete parameters for hyperoptimization which already have been used
    del config['HyperOptimization']['fix_random_seed']

    # Create the output folder up front, so an unwritable location fails
    # before the training instead of after it. A bare filename has an empty
    # dirname, which os.makedirs cannot handle, hence the guard.
    outputfolder = os.path.dirname(output_hdf)
    if outputfolder:
        os.makedirs(outputfolder, exist_ok=True)

    # For N_iter, perform k-fold crossvalidation
    if use_test_set:
        trained_classifier = cv.nocrossval(config,
                                           label_data_train,
                                           label_data_test,
                                           image_features_train,
                                           image_features_test,
                                           param_grid,
                                           modus=modus,
                                           use_fastr=config['Classification']['fastr'],
                                           fastr_plugin=config['Classification']['fastr_plugin'],
                                           ensemble=config['Ensemble'])
    else:
        trained_classifier = cv.crossval(config, label_data_train,
                                         image_features_train,
                                         param_grid,
                                         modus=modus,
                                         use_fastr=config['Classification']['fastr'],
                                         fastr_plugin=config['Classification']['fastr_plugin'],
                                         fixedsplits=fixedsplits,
                                         ensemble=config['Ensemble'],
                                         outputfolder=outputfolder,
                                         tempsave=config['General']['tempsave'],
                                         use_SMAC=config['SMAC']['use'],
                                         smac_result_file=output_smac)

    trained_classifier.to_hdf(output_hdf, 'EstimatorData')

    print("Saved data!")


def _first_entry(value, description):
    """Return the first entry of a list input; pass other values through.

    A warning naming ``description`` is printed when the list holds more
    than one entry, as only the first one is used.
    """
    if not isinstance(value, list):
        return value

    if len(value) > 1:
        # FIXME
        print('[WORC Warning] You provided multiple {}: only the first one will be used!'.format(description))

    return value[0]


def add_parameters_to_grid(param_grid, config):
    """Add non-classifier parameters from config to param grid.

    The entries are written into ``param_grid`` in place, and the same
    object is returned.

    Parameters
    ----------
    param_grid: dictionary
            Grid to which the entries are added.

    config: dictionary
            Configuration, indexed as ``config[section][option]``.

    Returns
    -------
    param_grid: dictionary
            The input grid, extended with the non-classifier entries.

    """
    # If at least once groupwise search is turned on, add it to the grid,
    # together with the setting of every feature group
    groupwise_search = config['Featsel']['GroupwiseSearch']
    if 'True' in groupwise_search:
        param_grid['SelectGroups'] = groupwise_search
        for group, setting in config['SelectFeatGroup'].items():
            param_grid[group] = setting

    def _bounded(distribution):
        """Make a converter passing a two-item config value as loc/scale."""
        def convert(bounds):
            return distribution(loc=bounds[0], scale=bounds[1])
        return convert

    # How a config value is turned into a grid entry:
    #   raw    - used as is
    #   single - wrapped in a one-item list
    #   bool   - boolean_uniform with the value as threshold
    #   int    - discrete_uniform with loc/scale from the value
    #   float  - scipy uniform with loc/scale from the value
    #   log    - log_uniform with loc/scale from the value
    converters = {
        'raw': lambda value: value,
        'single': lambda value: [value],
        'bool': lambda value: boolean_uniform(threshold=value),
        'int': _bounded(discrete_uniform),
        'float': _bounded(uniform),
        'log': _bounded(log_uniform),
    }

    # (grid key, config section, config option, kind), in insertion order
    layout = (
        # Feature scaling
        ('FeatureScaling', 'FeatureScaling', 'scaling_method', 'raw'),
        ('FeatureScaling_skip_features', 'FeatureScaling', 'skip_features', 'single'),

        # Oversampling methods
        ('Resampling_Use', 'Resampling', 'Use', 'bool'),
        ('Resampling_Method', 'Resampling', 'Method', 'raw'),
        ('Resampling_sampling_strategy', 'Resampling', 'sampling_strategy', 'raw'),
        ('Resampling_n_neighbors', 'Resampling', 'n_neighbors', 'int'),
        ('Resampling_k_neighbors', 'Resampling', 'k_neighbors', 'int'),
        ('Resampling_threshold_cleaning', 'Resampling', 'threshold_cleaning', 'float'),
        ('Resampling_n_cores', 'General', 'Joblib_ncores', 'single'),

        # Hyperparameter grid settings for SearchCV
        ('FeatPreProcess', 'FeatPreProcess', 'Use', 'raw'),
        ('Featsel_Variance', 'Featsel', 'Variance', 'bool'),

        ('OneHotEncoding', 'OneHotEncoding', 'Use', 'raw'),
        ('OneHotEncoding_feature_labels_tofit', 'OneHotEncoding', 'feature_labels_tofit', 'single'),

        ('Imputation', 'Imputation', 'use', 'raw'),
        ('ImputationMethod', 'Imputation', 'strategy', 'raw'),
        ('ImputationSkipAllNaN', 'Imputation', 'skipallNaN', 'raw'),
        ('ImputationNeighbours', 'Imputation', 'n_neighbors', 'int'),

        ('SelectFromModel', 'Featsel', 'SelectFromModel', 'bool'),
        ('SelectFromModel_lasso_alpha', 'Featsel', 'SelectFromModel_lasso_alpha', 'float'),
        ('SelectFromModel_estimator', 'Featsel', 'SelectFromModel_estimator', 'raw'),
        ('SelectFromModel_n_trees', 'Featsel', 'SelectFromModel_n_trees', 'int'),

        ('RFE', 'Featsel', 'RFE', 'bool'),
        ('RFE_lasso_alpha', 'Featsel', 'RFE_lasso_alpha', 'float'),
        ('RFE_estimator', 'Featsel', 'RFE_estimator', 'raw'),
        ('RFE_n_trees', 'Featsel', 'RFE_n_trees', 'int'),
        ('RFE_n_features_to_select', 'Featsel', 'RFE_n_features_to_select', 'int'),
        ('RFE_step', 'Featsel', 'RFE_step', 'int'),

        ('UsePCA', 'Featsel', 'UsePCA', 'bool'),
        ('PCAType', 'Featsel', 'PCAType', 'raw'),

        ('StatisticalTestUse', 'Featsel', 'StatisticalTestUse', 'bool'),
        ('StatisticalTestMetric', 'Featsel', 'StatisticalTestMetric', 'raw'),
        ('StatisticalTestThreshold', 'Featsel', 'StatisticalTestThreshold', 'log'),

        ('ReliefUse', 'Featsel', 'ReliefUse', 'bool'),
        ('ReliefNN', 'Featsel', 'ReliefNN', 'int'),
        ('ReliefSampleSize', 'Featsel', 'ReliefSampleSize', 'float'),
        ('ReliefDistanceP', 'Featsel', 'ReliefDistanceP', 'int'),
        ('ReliefNumFeatures', 'Featsel', 'ReliefNumFeatures', 'int'),
    )

    for grid_key, section, option, kind in layout:
        param_grid[grid_key] = converters[kind](config[section][option])

    # Add a random seed, which is required for many methods
    if config['HyperOptimization']['fix_random_seed']:
        # Fix the random seed
        seed_entry = [22]
    else:
        seed_entry = discrete_uniform(loc=0, scale=2**32 - 1)
    param_grid['random_seed'] = seed_entry

    return param_grid