
#!/usr/bin/env python

# Copyright 2016-2022 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd
import logging
import os
import time
from time import gmtime, strftime
import glob
import random
import json
import copy
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, LeaveOneOut
from joblib import Parallel, delayed
import WORC.addexceptions as ae
from WORC.classification.parameter_optimization import random_search_parameters, guided_search_parameters
from WORC.classification.regressors import regressors
from WORC.classification.SearchCV import RandomizedSearchCVfastr



def random_split_cross_validation(image_features, feature_labels, classes,
                                  patient_ids, n_iterations, param_grid,
                                  config, modus, test_size, start=0,
                                  save_data=None, tempsave=False,
                                  tempfolder=None, fixedsplits=None,
                                  fixed_seed=False, use_fastr=None,
                                  fastr_plugin=None, use_SMAC=False,
                                  smac_result_file=None):
    """Cross-validation in which the data is randomly split in each iteration.

    Due to the options of doing single-label and multi-label classification,
    stratified splitting, and regression, we use a manual loop instead of
    the default scikit-learn object.

    Parameters
    ------------
    image_features: list, mandatory
        Consists of a tuple of two lists for each patient:
        (feature_values, feature_labels).
    feature_labels: list, mandatory
        Labels of the features, assumed identical for all patients.
    classes: numpy array, mandatory
        Label values: a flat array for 'singlelabel', a labels-by-patients
        array for 'multilabel'.
    patient_ids: list, mandatory
        IDs of the patients, used to keep track of test and training sets.
    n_iterations: int, mandatory
        Number of train-test splits to perform.
    param_grid: dictionary, mandatory
        Parameters and their values used in the hyperparameter optimization.
    config: dict, mandatory
        Dictionary with config settings. See the Github Wiki for the
        available fields and formatting.
    modus: string, mandatory
        Either 'singlelabel' or 'multilabel'.
    test_size: float, mandatory
        Fraction of the patients assigned to the test set in each split.
    start: int, default 0
        Iteration to start at, used when resuming from temporary saves.
    save_data: list, optional
        Previously computed results to append to, used when resuming.
    tempsave: boolean, default False
        If True, pickle the result of each iteration to tempfolder.
    tempfolder: string, optional
        Folder for the temporary saves and performance files.
    fixedsplits: pandas dataframe, optional
        Fixed training and test splits; overrides n_iterations.
    fixed_seed: boolean, default False
        If True, use a fixed seed per iteration instead of a random one.
    use_fastr, fastr_plugin:
        See the crossval function.
    use_SMAC: boolean, default False
        If True, use SMAC-guided search instead of random search.
    smac_result_file: string, optional
        File in which to store the SMAC results.

    Returns
    ------------
    save_data: list
        Contains per iteration a tuple: (trained_classifier, X_train,
        X_test, Y_train, Y_test, patient_ID_train, patient_ID_test,
        random_seed).

    """
    print('Starting random-split cross-validation.')
    logging.debug('Starting random-split cross-validation.')
    if save_data is None:
        # Start from zero, thus empty list of previous data
        save_data = list()

    # If we are using fixed splits, set n_iterations to the number of splits
    if fixedsplits is not None:
        n_iterations = int(fixedsplits.columns.shape[0] / 2)
        print(f'Fixedsplits detected, adjusting n_iterations to {n_iterations}')
        logging.debug(f'Fixedsplits detected, adjusting n_iterations to {n_iterations}')

    for i in range(start, n_iterations):
        print(f'Cross-validation iteration {i + 1} / {n_iterations}.')
        logging.debug(f'Cross-validation iteration {i + 1} / {n_iterations}.')
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        print(f'\t Time: {timestamp}.')
        logging.debug(f'\t Time: {timestamp}.')
        if fixed_seed:
            random_seed = i**2
        else:
            random_seed = np.random.randint(5000)

        t = time.time()

        # Split into test and training set, where the percentage of each
        # label is maintained
        if any(clf in regressors for clf in param_grid['classifiers']):
            # We cannot do a stratified shuffle split with regression
            classes_temp = classes
            stratify = None
        else:
            if modus == 'singlelabel':
                classes_temp = stratify = classes.ravel()
            elif modus == 'multilabel':
                # Create a stratification object from the labels
                # Label = 0 means no label equals one
                # Other label numbers refer to the label name that is 1
                stratify = list()
                for pnum in range(0, len(classes[0])):
                    plabel = 0
                    for lnum, slabel in enumerate(classes):
                        if slabel[pnum] == 1:
                            plabel = lnum + 1
                    stratify.append(plabel)

                # Sklearn multiclass requires rows to be objects/patients
                classes_temp = np.zeros((classes.shape[1], classes.shape[0]))
                for n_patient in range(0, classes.shape[1]):
                    for n_label in range(0, classes.shape[0]):
                        classes_temp[n_patient, n_label] =\
                            classes[n_label, n_patient]
            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

        if fixedsplits is None:
            # Use random split. Split per patient, not per sample
            unique_patient_ids, unique_indices =\
                np.unique(np.asarray(patient_ids), return_index=True)
            if any(clf in regressors for clf in param_grid['classifiers']):
                unique_stratify = None
            else:
                unique_stratify = [stratify[i] for i in unique_indices]

            try:
                unique_PID_train, unique_PID_test =\
                    train_test_split(unique_patient_ids,
                                     test_size=test_size,
                                     random_state=random_seed,
                                     stratify=unique_stratify)
            except ValueError as e:
                e = str(e) + ' Increase the size of your validation set.'
                raise ae.WORCValueError(e)

            # Check for all ids if they are in test or training
            indices_train = list()
            indices_test = list()
            patient_ID_train = list()
            patient_ID_test = list()
            for num, pid in enumerate(patient_ids):
                if pid in unique_PID_train:
                    indices_train.append(num)

                    # Make sure we get a unique ID
                    if pid in patient_ID_train:
                        n = 1
                        while str(pid + '_' + str(n)) in patient_ID_train:
                            n += 1
                        pid = str(pid + '_' + str(n))
                    patient_ID_train.append(pid)
                else:
                    indices_test.append(num)

                    # Make sure we get a unique ID
                    if pid in patient_ID_test:
                        n = 1
                        while str(pid + '_' + str(n)) in patient_ID_test:
                            n += 1
                        pid = str(pid + '_' + str(n))
                    patient_ID_test.append(pid)

            # Split features and labels accordingly
            X_train = [image_features[i] for i in indices_train]
            X_test = [image_features[i] for i in indices_test]
            if modus == 'singlelabel':
                Y_train = classes_temp[indices_train]
                Y_test = classes_temp[indices_test]
            elif modus == 'multilabel':
                Y_train = classes_temp[indices_train, :]
                Y_test = classes_temp[indices_test, :]
            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

        else:
            # Use pre-defined splits
            train = fixedsplits[str(i) + '_train'].dropna().values
            test = fixedsplits[str(i) + '_test'].dropna().values

            # Convert the numbers to the correct indices
            ind_train = list()
            for j in train:
                success = False
                for num, p in enumerate(patient_ids):
                    if j == p:
                        ind_train.append(num)
                        success = True
                if not success:
                    raise ae.WORCIOError("Patient " + str(j).zfill(3) + " is not included!")

            ind_test = list()
            for j in test:
                success = False
                for num, p in enumerate(patient_ids):
                    if j == p:
                        ind_test.append(num)
                        success = True
                if not success:
                    raise ae.WORCIOError("Patient " + str(j).zfill(3) + " is not included!")

            X_train = [image_features[i] for i in ind_train]
            X_test = [image_features[i] for i in ind_test]

            patient_ID_train = patient_ids[ind_train]
            patient_ID_test = patient_ids[ind_test]

            if modus == 'singlelabel':
                Y_train = classes_temp[ind_train]
                Y_test = classes_temp[ind_test]
            elif modus == 'multilabel':
                Y_train = classes_temp[ind_train, :]
                Y_test = classes_temp[ind_test, :]
            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

        # Find best hyperparameters and construct classifier
        config['HyperOptimization']['use_fastr'] = use_fastr
        config['HyperOptimization']['fastr_plugin'] = fastr_plugin
        n_cores = config['General']['Joblib_ncores']
        if use_SMAC:
            trained_classifier =\
                guided_search_parameters(features=X_train,
                                         labels=Y_train,
                                         parameters=config,
                                         n_cores=n_cores,
                                         random_seed=random_seed,
                                         smac_result_file=smac_result_file,
                                         **config['HyperOptimization'])
        else:
            trained_classifier =\
                random_search_parameters(features=X_train,
                                         labels=Y_train,
                                         param_grid=param_grid,
                                         n_cores=n_cores,
                                         random_seed=random_seed,
                                         **config['HyperOptimization'])

        # We only want to save the feature values and one label array
        X_train = [x[0] for x in X_train]
        X_test = [x[0] for x in X_test]

        temp_save_data = (trained_classifier, X_train, X_test, Y_train,
                          Y_test, patient_ID_train, patient_ID_test,
                          random_seed)

        save_data.append(temp_save_data)

        # Test performance for various RS and ensemble sizes
        if config['General']['DoTestNRSNEns']:
            output_json = os.path.join(tempfolder,
                                       f'performance_RS_Ens_crossval_{i}.json')
            test_RS_Ensemble(estimator_input=trained_classifier,
                             X_train=X_train, Y_train=Y_train,
                             X_test=X_test, Y_test=Y_test,
                             feature_labels=feature_labels,
                             output_json=output_json)

            # Save memory
            delattr(trained_classifier, 'fitted_workflows')
            trained_classifier.fitted_workflows = list()
            delattr(trained_classifier, 'fitted_validation_workflows')
            trained_classifier.fitted_validation_workflows = list()

        # Create a temporary save
        if tempsave:
            panda_labels = ['trained_classifier', 'X_train', 'X_test',
                            'Y_train', 'Y_test', 'config',
                            'patient_ID_train', 'patient_ID_test',
                            'random_seed', 'feature_labels']

            panda_data_temp =\
                pd.Series([trained_classifier, X_train, X_test, Y_train,
                           Y_test, config, patient_ID_train,
                           patient_ID_test, random_seed, feature_labels],
                          index=panda_labels,
                          name='Constructed crossvalidation')

            panda_data = pd.DataFrame(panda_data_temp)
            n = 0
            filename = os.path.join(tempfolder, 'tempsave_' + str(i) + '.pkl')
            while os.path.exists(filename):
                n += 1
                filename = os.path.join(tempfolder,
                                        'tempsave_' + str(i + n) + '.pkl')

            panda_data.to_pickle(filename)
            del panda_data, panda_data_temp

        # Print elapsed time
        elapsed = int((time.time() - t) / 60.0)
        print(f'\t Fitting took {elapsed} minutes.')
        logging.debug(f'\t Fitting took {elapsed} minutes.')

    return save_data
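

# Illustrative sketch (not part of the WORC API): how the multilabel
# stratification labels above are derived from a labels-by-patients binary
# matrix. Patients with no positive label get stratum 0; otherwise the
# stratum is the (1-based) index of the last positive label. The data below
# is made up purely for demonstration.
def _example_multilabel_stratification():
    # Rows are labels, columns are patients (the layout this module expects)
    classes = np.array([[1, 0, 0, 1],   # label 1
                        [0, 1, 0, 1]])  # label 2

    stratify = list()
    for pnum in range(0, len(classes[0])):
        plabel = 0
        for lnum, slabel in enumerate(classes):
            if slabel[pnum] == 1:
                plabel = lnum + 1
        stratify.append(plabel)

    # Patient 4 is positive for both labels; the last label wins
    assert stratify == [1, 2, 0, 2]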


def LOO_cross_validation(image_features, feature_labels, classes, patient_ids,
                         param_grid, config, modus, test_size, start=0,
                         save_data=None, tempsave=False, tempfolder=None,
                         fixedsplits=None, fixed_seed=False, use_fastr=None,
                         fastr_plugin=None, use_SMAC=False,
                         smac_result_file=None):
    """Cross-validation in which each sample is used once as the test set.

    Mostly based on the default sklearn object.

    Parameters
    ------------
    As in random_split_cross_validation, except that n_iterations does not
    apply: each patient is held out exactly once. The test_size and
    fixedsplits arguments are accepted for API symmetry but are not used.

    Returns
    ------------
    save_data: list
        Contains per iteration a tuple: (trained_classifier, X_train,
        X_test, Y_train, Y_test, patient_ID_train, patient_ID_test,
        random_seed).

    """
    print('Starting leave-one-out cross-validation.')
    logging.debug('Starting leave-one-out cross-validation.')
    cv = LeaveOneOut()
    n_splits = cv.get_n_splits(image_features)

    if save_data is None:
        # Start from zero, thus empty list of previous data
        save_data = list()

    for i, (indices_train, indices_test) in enumerate(cv.split(image_features)):
        if i < start:
            continue

        print(f'Cross-validation iteration {i + 1} / {n_splits}.')
        logging.debug(f'Cross-validation iteration {i + 1} / {n_splits}.')
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        print(f'\t Time: {timestamp}.')
        logging.debug(f'\t Time: {timestamp}.')
        if fixed_seed:
            random_seed = i**2
        else:
            random_seed = np.random.randint(5000)

        t = time.time()

        # Split features and labels accordingly
        X_train = [image_features[j] for j in indices_train]
        X_test = [image_features[j] for j in indices_test]
        patient_ID_train = [patient_ids[j] for j in indices_train]
        patient_ID_test = [patient_ids[j] for j in indices_test]

        if modus == 'singlelabel':
            # Simply use the given class labels
            classes_temp = classes.ravel()

            # Split in training and testing
            Y_train = classes_temp[indices_train]
            Y_test = classes_temp[indices_test]

        elif modus == 'multilabel':
            # Sklearn multiclass requires rows to be objects/patients
            classes_temp = np.zeros((classes.shape[1], classes.shape[0]))
            for n_patient in range(0, classes.shape[1]):
                for n_label in range(0, classes.shape[0]):
                    classes_temp[n_patient, n_label] =\
                        classes[n_label, n_patient]

            # Split in training and testing
            Y_train = classes_temp[indices_train, :]
            Y_test = classes_temp[indices_test, :]

        else:
            raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

        # Find best hyperparameters and construct classifier
        config['HyperOptimization']['use_fastr'] = use_fastr
        config['HyperOptimization']['fastr_plugin'] = fastr_plugin
        n_cores = config['General']['Joblib_ncores']
        if use_SMAC:
            trained_classifier =\
                guided_search_parameters(features=X_train,
                                         labels=Y_train,
                                         parameters=config,
                                         n_cores=n_cores,
                                         random_seed=random_seed,
                                         smac_result_file=smac_result_file,
                                         **config['HyperOptimization'])
        else:
            trained_classifier =\
                random_search_parameters(features=X_train,
                                         labels=Y_train,
                                         param_grid=param_grid,
                                         n_cores=n_cores,
                                         random_seed=random_seed,
                                         **config['HyperOptimization'])

        # We only want to save the feature values and one label array
        X_train = [x[0] for x in X_train]
        X_test = [x[0] for x in X_test]

        temp_save_data = (trained_classifier, X_train, X_test, Y_train,
                          Y_test, patient_ID_train, patient_ID_test,
                          random_seed)

        save_data.append(temp_save_data)

        # Create a temporary save
        if tempsave:
            panda_labels = ['trained_classifier', 'X_train', 'X_test',
                            'Y_train', 'Y_test', 'config',
                            'patient_ID_train', 'patient_ID_test',
                            'random_seed', 'feature_labels']

            panda_data_temp =\
                pd.Series([trained_classifier, X_train, X_test, Y_train,
                           Y_test, config, patient_ID_train,
                           patient_ID_test, random_seed, feature_labels],
                          index=panda_labels,
                          name='Constructed crossvalidation')

            panda_data = pd.DataFrame(panda_data_temp)
            n = 0
            # Save as pickle, consistent with random_split_cross_validation,
            # so the resume logic in crossval() can pick these files up
            filename = os.path.join(tempfolder, 'tempsave_' + str(i) + '.pkl')
            while os.path.exists(filename):
                n += 1
                filename = os.path.join(tempfolder,
                                        'tempsave_' + str(i + n) + '.pkl')

            panda_data.to_pickle(filename)
            del panda_data, panda_data_temp

        # Print elapsed time
        elapsed = int((time.time() - t) / 60.0)
        print(f'\t Fitting took {elapsed} minutes.')
        logging.debug(f'\t Fitting took {elapsed} minutes.')

    return save_data
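

# Illustrative sketch (not part of the WORC API): the sklearn LeaveOneOut
# splitter that LOO_cross_validation relies on. Each patient is the test
# set exactly once; the toy feature list below is made up for demonstration.
def _example_leave_one_out():
    image_features = ['feat_p1', 'feat_p2', 'feat_p3']  # one entry per patient
    cv = LeaveOneOut()
    assert cv.get_n_splits(image_features) == 3
    for i, (indices_train, indices_test) in enumerate(cv.split(image_features)):
        # Exactly one held-out sample per iteration, in order
        assert len(indices_test) == 1 and indices_test[0] == i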


def crossval(config, label_data, image_features, param_grid=None,
             use_fastr=False, fastr_plugin=None, tempsave=False,
             fixedsplits=None, ensemble={'Use': False}, outputfolder=None,
             modus='singlelabel', use_SMAC=False, smac_result_file=None):
    """Construct multiple individual classifiers based on the label settings.

    Parameters
    ----------
    config: dict, mandatory
            Dictionary with config settings. See the Github Wiki for the
            available fields and formatting.

    label_data: dict, mandatory
            Should contain the following:
            patient_ids (list): ids of the patients, used to keep
            track of test and training sets, and label data
            label (list): List of lists, where each list contains the
            label status for that patient for each label
            label_name (list): Contains the different names that are
            stored in the label object

    image_features: numpy array, mandatory
            Consists of a tuple of two lists for each patient:
            (feature_values, feature_labels)

    param_grid: dictionary, optional
            Contains the parameters and their values which are used in the
            grid or randomized search hyperparameter optimization. See the
            construct_classifier function for some examples.

    use_fastr: boolean, default False
            If False, parallel execution through Joblib is used for fast
            execution of the hyperparameter optimization. Especially
            suited for execution on multicore (H)PCs. The settings used
            are specified in the config.ini file in the IOparser folder,
            which you can adjust to your system.

            If True, fastr is used to split the hyperparameter
            optimization in separate jobs. Parameters for the splitting
            can be specified in the config file. Especially suited for
            clusters.

    fastr_plugin: string, default None
            Determines which plugin is used for fastr executions.
            When None, uses the default plugin from the fastr config.

    tempsave: boolean, default False
            If True, create a .pkl file after each cross-validation
            containing the classifier and results from that split. This
            is written to the GSOut folder in your fastr output mount. If
            False, only the result of all combined cross-validations will
            be saved to an .hdf5 file. This will also be done if set to
            True.

    fixedsplits: string, optional
            By default, random-split cross-validation is used to train and
            evaluate the machine learning methods. Optionally, you can
            provide a .xlsx file containing fixed splits to be used. See
            the Github Wiki for the format.

    ensemble: dictionary, optional
            Contains the configuration for constructing an ensemble.

    outputfolder: string, optional
            Folder to which the log and temporary saves are written.
            Defaults to the current working directory.

    modus: string, default 'singlelabel'
            Determine whether one-vs-all classification (or regression)
            for each single label is used ('singlelabel') or if multilabel
            classification is performed ('multilabel').

    use_SMAC: boolean, default False
            If True, use SMAC-guided search instead of random search for
            the hyperparameter optimization.

    smac_result_file: string, optional
            File in which to store the SMAC results.

    Returns
    ----------
    panda_data: pandas dataframe
            Contains all information on the trained classifier.

    """
    # Process input data
    patient_ids = label_data['patient_IDs']
    label_value = label_data['label']
    label_name = label_data['label_name']

    if outputfolder is None:
        outputfolder = os.getcwd()

    logfilename = os.path.join(outputfolder, 'classifier.log')
    print("Logging to file " + str(logfilename))

    # Cross-validation iteration to start with
    start = 0
    save_data = list()
    if tempsave:
        tempfolder = os.path.join(outputfolder, 'tempsave')
        if not os.path.exists(tempfolder):
            # No previous tempsaves
            os.makedirs(tempfolder)
        else:
            # Previous tempsaves, start where we left off
            tempsaves = glob.glob(os.path.join(tempfolder, 'tempsave_*.pkl'))
            start = len(tempsaves)

            # Load previous tempsaves and add to save data
            tempsaves.sort()
            for t in tempsaves:
                t = pd.read_pickle(t)
                t = t['Constructed crossvalidation']
                temp_save_data = (t.trained_classifier, t.X_train, t.X_test,
                                  t.Y_train, t.Y_test, t.patient_ID_train,
                                  t.patient_ID_test, t.random_seed)

                save_data.append(temp_save_data)
    else:
        tempfolder = None

    # Reset the logging handlers so basicConfig takes effect
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    logging.basicConfig(filename=logfilename, level=logging.DEBUG)
    crossval_type = config['CrossValidation']['Type']
    n_iterations = config['CrossValidation']['N_iterations']
    test_size = config['CrossValidation']['test_size']
    fixed_seed = config['CrossValidation']['fixed_seed']

    classifier_labelss = dict()

    logging.debug('Starting fitting of estimators.')

    # We only need one label instance, assuming they are all the same
    feature_labels = image_features[0][1]

    # Check if we need to use fixedsplits:
    if fixedsplits is not None and '.csv' in fixedsplits:
        fixedsplits = pd.read_csv(fixedsplits, header=0)

        # Fixedsplits need to be performed in random-split fashion,
        # which makes no sense for LOO
        if crossval_type == 'LOO':
            print('[WORC WARNING] Fixedsplits need to be performed in random-split fashion, makes no sense for LOO.')
            crossval_type = 'random_split'

    if modus == 'singlelabel':
        print('Performing single-label classification.')
        logging.debug('Performing single-label classification.')
    elif modus == 'multilabel':
        print('Performing multi-label classification.')
        logging.debug('Performing multi-label classification.')
        label_value = [label_value]
        label_name = [label_name]
    else:
        m = '{} is not a valid modus!'.format(modus)
        logging.debug(m)
        raise ae.WORCKeyError(m)

    for i_class, i_name in zip(label_value, label_name):
        if not tempsave:
            save_data = list()

        if crossval_type == 'random_split':
            print('Performing random-split cross-validations.')
            logging.debug('Performing random-split cross-validations.')
            save_data =\
                random_split_cross_validation(image_features=image_features,
                                              feature_labels=feature_labels,
                                              classes=i_class,
                                              patient_ids=patient_ids,
                                              n_iterations=n_iterations,
                                              param_grid=param_grid,
                                              config=config,
                                              modus=modus,
                                              test_size=test_size,
                                              start=start,
                                              save_data=save_data,
                                              tempsave=tempsave,
                                              tempfolder=tempfolder,
                                              fixedsplits=fixedsplits,
                                              fixed_seed=fixed_seed,
                                              use_fastr=use_fastr,
                                              fastr_plugin=fastr_plugin,
                                              use_SMAC=use_SMAC,
                                              smac_result_file=smac_result_file)
        elif crossval_type == 'LOO':
            print('Performing leave-one-out cross-validations.')
            logging.debug('Performing leave-one-out cross-validations.')
            save_data =\
                LOO_cross_validation(image_features=image_features,
                                     feature_labels=feature_labels,
                                     classes=i_class,
                                     patient_ids=patient_ids,
                                     param_grid=param_grid,
                                     config=config,
                                     modus=modus,
                                     test_size=test_size,
                                     start=start,
                                     save_data=save_data,
                                     tempsave=tempsave,
                                     tempfolder=tempfolder,
                                     fixedsplits=fixedsplits,
                                     fixed_seed=fixed_seed,
                                     use_fastr=use_fastr,
                                     fastr_plugin=fastr_plugin,
                                     use_SMAC=use_SMAC,
                                     smac_result_file=smac_result_file)
        else:
            raise ae.WORCKeyError(f'{crossval_type} is not a recognized cross-validation type.')

        [classifiers, X_train_set, X_test_set, Y_train_set, Y_test_set,
         patient_ID_train_set, patient_ID_test_set, seed_set] =\
            zip(*save_data)

        # Convert to lists
        classifiers = list(classifiers)
        X_train_set = list(X_train_set)
        X_test_set = list(X_test_set)
        Y_train_set = list(Y_train_set)
        Y_test_set = list(Y_test_set)
        patient_ID_train_set = list(patient_ID_train_set)
        patient_ID_test_set = list(patient_ID_test_set)
        seed_set = list(seed_set)

        panda_labels = ['classifiers', 'X_train', 'X_test', 'Y_train',
                        'Y_test', 'config', 'patient_ID_train',
                        'patient_ID_test', 'random_seed', 'feature_labels']

        panda_data_temp =\
            pd.Series([classifiers, X_train_set, X_test_set, Y_train_set,
                       Y_test_set, config, patient_ID_train_set,
                       patient_ID_test_set, seed_set, feature_labels],
                      index=panda_labels,
                      name='Constructed crossvalidation')

        if modus == 'singlelabel':
            i_name = ''.join(i_name)
        elif modus == 'multilabel':
            i_name = ','.join(i_name)

        classifier_labelss[i_name] = panda_data_temp

    panda_data = pd.DataFrame(classifier_labelss)

    return panda_data
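

# Illustrative sketch (not part of the WORC API): the input shapes crossval()
# expects. The config dict normally comes from WORC's own config parsing; the
# keys shown are only those this module reads directly, and all example
# values and names below are assumptions for demonstration, not a working
# pipeline.
def _example_crossval_inputs():
    label_data = {
        'patient_IDs': ['patient_001', 'patient_002'],  # one ID per patient
        'label': [[0, 1]],                              # one list per label name
        'label_name': ['imaginary_label_1'],            # hypothetical label name
    }
    # One (feature_values, feature_labels) tuple per patient
    image_features = [([0.1, 2.3], ['feature_a', 'feature_b']),
                      ([0.4, 1.7], ['feature_a', 'feature_b'])]
    config_subset = {
        'CrossValidation': {'Type': 'random_split', 'N_iterations': 100,
                            'test_size': 0.2, 'fixed_seed': False},
        'General': {'Joblib_ncores': 4, 'DoTestNRSNEns': False},
        'HyperOptimization': {},  # normally filled by WORC's config defaults
    }
    return label_data, image_features, config_subset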


def nocrossval(config, label_data_train, label_data_test, image_features_train,
               image_features_test, param_grid=None, use_fastr=False,
               fastr_plugin=None, ensemble={'Use': False},
               modus='singlelabel'):
    """Construct multiple individual classifiers based on the label settings.

    Arguments:
        config (Dict): Dictionary with config settings
        label_data (Dict): should contain:
            patient_ids (list): ids of the patients, used to keep
            track of test and training sets, and label data
            label (list): List of lists, where each list contains the
            label status for that patient for each label
            label_name (list): Contains the different names that are
            stored in the label object
        image_features (numpy array): Consists of a tuple of two lists for
            each patient: (feature_values, feature_labels)

        ensemble: dictionary, optional
                Contains the configuration for constructing an ensemble.

        modus: string, default 'singlelabel'
                Determine whether one-vs-all classification (or regression)
                for each single label is used ('singlelabel') or if
                multilabel classification is performed ('multilabel').

    Returns:
        classifier_data (pandas dataframe)

    """
    patient_ids_train = label_data_train['patient_IDs']
    label_value_train = label_data_train['label']
    label_name_train = label_data_train['label_name']

    patient_ids_test = label_data_test['patient_IDs']
    if 'label' in label_data_test.keys():
        label_value_test = label_data_test['label']
    else:
        label_value_test = [None] * len(patient_ids_test)

    logfilename = os.path.join(os.getcwd(), 'classifier.log')
    logging.basicConfig(filename=logfilename, level=logging.DEBUG)

    classifier_labelss = dict()

    logging.debug('Starting classifier')

    # Determine modus
    if modus == 'singlelabel':
        print('Performing single-label classification.')
        logging.debug('Performing single-label classification.')
    elif modus == 'multilabel':
        print('Performing multi-label classification.')
        logging.debug('Performing multi-label classification.')
        label_name_train = [label_name_train]
    else:
        m = '{} is not a valid modus!'.format(modus)
        logging.debug(m)
        raise ae.WORCKeyError(m)

    # We only need one label instance, assuming they are all the same
    feature_labels = image_features_train[0][1]
    for i_name in label_name_train:
        save_data = list()

        random_seed = np.random.randint(5000)

        # Split into test and training set, where the percentage of each
        # label is maintained
        X_train = image_features_train
        X_test = image_features_test
        if modus == 'singlelabel':
            Y_train = label_value_train.ravel()
            Y_test = label_value_test.ravel()
        else:
            # Sklearn multiclass requires rows to be objects/patients
            Y_train = label_value_train
            Y_train_temp = np.zeros((Y_train.shape[1], Y_train.shape[0]))
            for n_patient in range(0, Y_train.shape[1]):
                for n_label in range(0, Y_train.shape[0]):
                    Y_train_temp[n_patient, n_label] =\
                        Y_train[n_label, n_patient]
            Y_train = Y_train_temp

            Y_test = label_value_test
            Y_test_temp = np.zeros((Y_test.shape[1], Y_test.shape[0]))
            for n_patient in range(0, Y_test.shape[1]):
                for n_label in range(0, Y_test.shape[0]):
                    Y_test_temp[n_patient, n_label] =\
                        Y_test[n_label, n_patient]
            Y_test = Y_test_temp

        # Find best hyperparameters and construct classifier
        config['HyperOptimization']['use_fastr'] = use_fastr
        config['HyperOptimization']['fastr_plugin'] = fastr_plugin
        n_cores = config['General']['Joblib_ncores']
        trained_classifier =\
            random_search_parameters(features=X_train,
                                     labels=Y_train,
                                     param_grid=param_grid,
                                     n_cores=n_cores,
                                     **config['HyperOptimization'])

        # Create an ensemble if required
        # NOTE: removed to keep memory and storage usage low
        # trained_classifier.create_ensemble(X_train, Y_train, method=ensemble['Use'])

        # Extract the feature values
        X_train = np.asarray([x[0] for x in X_train])
        X_test = np.asarray([x[0] for x in X_test])

        temp_save_data = (trained_classifier, X_train, X_test, Y_train,
                          Y_test, patient_ids_train, patient_ids_test,
                          random_seed)

        save_data.append(temp_save_data)

        [classifiers, X_train_set, X_test_set, Y_train_set, Y_test_set,
         patient_ID_train_set, patient_ID_test_set, seed_set] =\
            zip(*save_data)

        panda_labels = ['classifiers', 'X_train', 'X_test', 'Y_train',
                        'Y_test', 'config', 'patient_ID_train',
                        'patient_ID_test', 'random_seed', 'feature_labels']

        panda_data_temp =\
            pd.Series([classifiers, X_train_set, X_test_set, Y_train_set,
                       Y_test_set, config, patient_ID_train_set,
                       patient_ID_test_set, seed_set, feature_labels],
                      index=panda_labels,
                      name='Constructed crossvalidation')

        i_name = ''.join(i_name)
        classifier_labelss[i_name] = panda_data_temp

        # Test performance for various RS and ensemble sizes
        if config['General']['DoTestNRSNEns']:
            # FIXME: Use home folder, as this function does not know
            # where final or temporary output is located
            output_json = os.path.join(os.path.expanduser("~"),
                                       'performance_RS_Ens.json')
            test_RS_Ensemble(estimator_input=trained_classifier,
                             X_train=X_train, Y_train=Y_train,
                             X_test=X_test, Y_test=Y_test,
                             feature_labels=feature_labels,
                             output_json=output_json)

            # Save memory
            delattr(trained_classifier, 'fitted_workflows')
            trained_classifier.fitted_workflows = list()
            delattr(trained_classifier, 'fitted_validation_workflows')
            trained_classifier.fitted_validation_workflows = list()

    panda_data = pd.DataFrame(classifier_labelss)

    return panda_data
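

# Illustrative sketch (not part of the WORC API): the element-wise double
# loop used above to turn a labels-by-patients matrix into the
# patients-by-labels layout sklearn expects is equivalent to a numpy
# transpose. Toy data for demonstration only.
def _example_labels_to_patient_rows():
    Y = np.array([[1, 0, 0],   # label 1, three patients
                  [0, 1, 1]])  # label 2, three patients

    Y_temp = np.zeros((Y.shape[1], Y.shape[0]))
    for n_patient in range(0, Y.shape[1]):
        for n_label in range(0, Y.shape[0]):
            Y_temp[n_patient, n_label] = Y[n_label, n_patient]

    assert np.array_equal(Y_temp, Y.T)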


def test_RS_Ensemble(estimator_input, X_train, Y_train, X_test, Y_test,
                     feature_labels, output_json, verbose=False, RSs=None,
                     ensembles=None, maxlen=100):
    """Test performance for different random search and ensemble sizes.

    This function was written for conducting a specific experiment from the
    WORC paper to test how the performance varies with varying random search
    and ensemble sizes. We do not recommend general usage of this function.

    maxlen gives the maximum number of workflows kept for the (numeric)
    ensembling tests, default 100.
    """
    # Process some input
    estimator_original = copy.deepcopy(estimator_input)
    X_train_temp = [(x, feature_labels) for x in X_train]
    n_workflows = len(estimator_original.cv_results_['mean_test_score'])

    # Settings
    if RSs is None:
        RSs = [10, 100, 1000, 10000] * 10 + [n_workflows]
    if ensembles is None:
        ensembles = [1, 10, 100, 'FitNumber', 'Bagging']

    # Loop over the random searches and ensembles
    keys = list()
    performances = dict()
    for RS in RSs:
        if RS <= n_workflows:
            # Make a key for saving the score
            num = 0
            key = f'RS {RS} try {str(num).zfill(2)}'
            while key in keys:
                num += 1
                key = f'RS {RS} try {str(num).zfill(2)}'
            keys.append(key)

            # Make a local copy of the estimator and select only a subset
            # of the workflows
            print(f'\t Using RS {RS}.')
            estimator = copy.deepcopy(estimator_original)

            # estimator.maxlen = RS # Why is this needed? This will only lead to a lot of extra workflows on top of the top 100 being fitted
            estimator.maxlen = min(RS, maxlen)

            workflow_num = np.arange(n_workflows).tolist()

            # Select only a specific set of workflows
            random.shuffle(workflow_num)
            selected_workflows = workflow_num[0:RS]

            # Get the mean performances and get new ranking
            F1_validation = estimator.cv_results_['mean_test_score']
            F1_validation = [F1_validation[i] for i in selected_workflows]
            # argsort normally ranks from smallest to largest, so reverse
            workflow_ranking = np.argsort(np.asarray(F1_validation)).tolist()[::-1]
            # Only maxlen estimators are needed for the ensembling tests
            workflow_ranking = workflow_ranking[0:maxlen]
            F1_validation = [F1_validation[i] for i in workflow_ranking]

            # Only keep the number of RS required and resort based on ranking
            if estimator.fitted_workflows:
                estimator.fitted_workflows =\
                    [estimator.fitted_workflows[i] for i in selected_workflows]
                estimator.fitted_workflows =\
                    [estimator.fitted_workflows[i] for i in workflow_ranking]

            # For advanced ensembling methods, keep only the parameters
            # of the selected RS workflows
            estimator.cv_results_['params'] =\
                [estimator.cv_results_['params'][i] for i in selected_workflows]
            estimator.cv_results_['params'] =\
                [estimator.cv_results_['params'][i] for i in workflow_ranking]

            # Refit validation estimators if required
            if not estimator.fitted_validation_workflows and estimator.refit_validation_workflows:
                print("\t Refit all validation workflows so we don't have to do this for every ensembling method.")

                # Define function to fit a single estimator
                def fitvalidationestimator(parameters, train, test):
                    new_estimator = RandomizedSearchCVfastr()
                    new_estimator.refit_and_score(X_train_temp, Y_train,
                                                  parameters,
                                                  train=train, test=test)
                    return new_estimator

                # Use joblib to parallelize fitting
                estimators =\
                    Parallel(n_jobs=-1)(delayed(fitvalidationestimator)(
                        parameters, train, test)
                        for parameters in estimator.cv_results_['params']
                        for train, test in estimator.cv_iter)
                estimator.fitted_validation_workflows = estimators

            elif estimator.fitted_validation_workflows:
                # Select the required already fitted validation workflows
                selected_workflows_ranked_all = list()
                for j in range(len(estimator.cv_iter)):
                    selected_workflows_ranked =\
                        [i + n_workflows * j for i in selected_workflows]
                    selected_workflows_ranked =\
                        [selected_workflows_ranked[i] for i in workflow_ranking]
                    selected_workflows_ranked_all.extend(selected_workflows_ranked)

                estimator.fitted_validation_workflows =\
                    [estimator.fitted_validation_workflows[i]
                     for i in selected_workflows_ranked_all]

            # Store the mean train and validation F1-scores
            F1_training = estimator.cv_results_['mean_train_score']
            F1_training = [F1_training[i] for i in selected_workflows]
            F1_training = [F1_training[i] for i in workflow_ranking]

            performances[f'Mean training F1-score {key} top {maxlen}'] = F1_training
            performances[f'Mean validation F1-score {key} top {maxlen}'] = F1_validation

            for ensemble in ensembles:
                if isinstance(ensemble, int):
                    if ensemble > RS:
                        continue
                    else:
                        print(f'\t Using ensemble {ensemble}.')
                        # Create the ensemble
                        estimator.create_ensemble(X_train_temp, Y_train,
                                                  method='top_N',
                                                  size=ensemble,
                                                  verbose=verbose)
                else:
                    print(f'\t Using ensemble {ensemble}.')
                    # Create the ensemble
                    estimator.create_ensemble(X_train_temp, Y_train,
                                              method=ensemble,
                                              verbose=verbose)

                performances[f'Validation F1-score Ensemble {ensemble} {key}'] =\
                    estimator.ensemble_validation_score

                # Compute performance on the test set
                y_prediction = estimator.predict(X_test)
                y_score = estimator.predict_proba(X_test)[:, 1]
                auc = roc_auc_score(Y_test, y_score)
                f1_score_out = f1_score(Y_test, y_prediction, average='weighted')
                performances[f'Test F1-score Ensemble {ensemble} {key}'] = f1_score_out
                performances[f'Test AUC Ensemble {ensemble} {key}'] = auc

                # Compute performance on the training set
                y_prediction = estimator.predict(X_train)
                y_score = estimator.predict_proba(X_train)[:, 1]
                auc = roc_auc_score(Y_train, y_score)
                f1_score_out = f1_score(Y_train, y_prediction, average='weighted')
                performances[f'Train F1-score Ensemble {ensemble} {key}'] = f1_score_out
                performances[f'Train AUC Ensemble {ensemble} {key}'] = auc

    # Write output
    with open(output_json, 'w') as fp:
        json.dump(performances, fp, sort_keys=True, indent=4)
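

# Illustrative sketch (not part of the WORC API): inspecting the performance
# JSON written by test_RS_Ensemble. The default path is hypothetical; pass
# whatever output_json you used above.
def _example_read_performances(output_json='performance_RS_Ens.json'):
    with open(output_json, 'r') as fp:
        performances = json.load(fp)

    # Keys look like 'Test AUC Ensemble 100 RS 1000 try 00'
    for key in sorted(performances):
        if key.startswith('Test AUC'):
            print(key, performances[key])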