#!/usr/bin/env python
# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pandas as pd
import logging
import os
import time
from time import gmtime, strftime
from sklearn.model_selection import train_test_split, LeaveOneOut
from .parameter_optimization import random_search_parameters
import WORC.addexceptions as ae
from WORC.classification.regressors import regressors
import glob
import random
import json
from copy import copy
from sklearn.metrics import f1_score, roc_auc_score


def random_split_cross_validation(image_features, feature_labels, classes,
                                  patient_ids,
                                  n_iterations, param_grid, config,
                                  modus, test_size, start=0, save_data=None,
                                  tempsave=False, tempfolder=None,
                                  fixedsplits=None,
                                  fixed_seed=False, use_fastr=None,
                                  fastr_plugin=None,
                                  do_test_RS_Ensemble=False):
"""Cross-validation in which data is randomly split in each iteration.
Due to options of doing single-label and multi-label classification,
stratified splitting, and regression, we use a manual loop instead
of the default scikit-learn object.
Parameters
------------
Returns
------------
"""
print('Starting random-split cross-validation.')
logging.debug('Starting random-split cross-validation.')
if save_data is None:
        # Start from zero, thus empty list of previous data
save_data = list()
# If we are using fixed splits, set the n_iterations to the number of splits
if fixedsplits is not None:
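        # The splits file contains two columns per iteration,
        # '<i>_train' and '<i>_test' (see below), hence the division by two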
n_iterations = int(fixedsplits.columns.shape[0] / 2)
print(f'Fixedsplits detected, adjusting n_iterations to {n_iterations}')
for i in range(start, n_iterations):
        print(f'Cross-validation iteration {i + 1} / {n_iterations}.')
        logging.debug(f'Cross-validation iteration {i + 1} / {n_iterations}.')
timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
print(f'\t Time: {timestamp}.')
logging.debug(f'\t Time: {timestamp}.')
if fixed_seed:
random_seed = i**2
else:
random_seed = np.random.randint(5000)
t = time.time()
# Split into test and training set, where the percentage of each
# label is maintained
if any(clf in regressors for clf in param_grid['classifiers']):
# We cannot do a stratified shuffle split with regression
classes_temp = classes
stratify = None
else:
if modus == 'singlelabel':
classes_temp = stratify = classes.ravel()
elif modus == 'multilabel':
# Create a stratification object from the labels
                # Label 0 means that no label is positive for this patient;
                # any other value k means that label k - 1 is the positive one
stratify = list()
for pnum in range(0, len(classes[0])):
plabel = 0
for lnum, slabel in enumerate(classes):
if slabel[pnum] == 1:
plabel = lnum + 1
stratify.append(plabel)
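                # For example (hypothetical), with two labels and three
                # patients, classes = [[1, 0, 0], [0, 1, 0]] yields
                # stratify = [1, 2, 0]: patient 0 is positive for label 1,
                # patient 1 for label 2, and patient 2 for neither.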
                # Sklearn multiclass requires rows to be objects/patients
                classes_temp = np.transpose(classes)
else:
                raise ae.WORCKeyError(f'{modus} is not a valid modus!')
if fixedsplits is None:
# Use Random Split. Split per patient, not per sample
unique_patient_ids, unique_indices =\
np.unique(np.asarray(patient_ids), return_index=True)
if any(clf in regressors for clf in param_grid['classifiers']):
unique_stratify = None
else:
unique_stratify = [stratify[i] for i in unique_indices]
try:
unique_PID_train, indices_PID_test\
= train_test_split(unique_patient_ids,
test_size=test_size,
random_state=random_seed,
stratify=unique_stratify)
except ValueError as e:
e = str(e) + ' Increase the size of your validation set.'
raise ae.WORCValueError(e)
# Check for all ids if they are in test or training
indices_train = list()
indices_test = list()
patient_ID_train = list()
patient_ID_test = list()
for num, pid in enumerate(patient_ids):
if pid in unique_PID_train:
indices_train.append(num)
                    # Make sure we get a unique ID
                    if pid in patient_ID_train:
                        n = 1
                        while f'{pid}_{n}' in patient_ID_train:
                            n += 1
                        pid = f'{pid}_{n}'
patient_ID_train.append(pid)
else:
indices_test.append(num)
                    # Make sure we get a unique ID
                    if pid in patient_ID_test:
                        n = 1
                        while f'{pid}_{n}' in patient_ID_test:
                            n += 1
                        pid = f'{pid}_{n}'
patient_ID_test.append(pid)
# Split features and labels accordingly
X_train = [image_features[i] for i in indices_train]
X_test = [image_features[i] for i in indices_test]
if modus == 'singlelabel':
Y_train = classes_temp[indices_train]
Y_test = classes_temp[indices_test]
elif modus == 'multilabel':
Y_train = classes_temp[indices_train, :]
Y_test = classes_temp[indices_test, :]
else:
                raise ae.WORCKeyError(f'{modus} is not a valid modus!')
else:
            # Use predefined splits
train = fixedsplits[str(i) + '_train'].dropna().values
test = fixedsplits[str(i) + '_test'].dropna().values
# Convert the numbers to the correct indices
ind_train = list()
for j in train:
success = False
for num, p in enumerate(patient_ids):
if j == p:
ind_train.append(num)
success = True
if not success:
raise ae.WORCIOError("Patient " + str(j).zfill(3) + " is not included!")
ind_test = list()
for j in test:
success = False
for num, p in enumerate(patient_ids):
if j == p:
ind_test.append(num)
success = True
if not success:
raise ae.WORCIOError("Patient " + str(j).zfill(3) + " is not included!")
X_train = [image_features[i] for i in ind_train]
X_test = [image_features[i] for i in ind_test]
patient_ID_train = patient_ids[ind_train]
patient_ID_test = patient_ids[ind_test]
if modus == 'singlelabel':
Y_train = classes_temp[ind_train]
Y_test = classes_temp[ind_test]
elif modus == 'multilabel':
Y_train = classes_temp[ind_train, :]
Y_test = classes_temp[ind_test, :]
else:
                raise ae.WORCKeyError(f'{modus} is not a valid modus!')
# Find best hyperparameters and construct classifier
config['HyperOptimization']['use_fastr'] = use_fastr
config['HyperOptimization']['fastr_plugin'] = fastr_plugin
n_cores = config['General']['Joblib_ncores']
trained_classifier = random_search_parameters(features=X_train,
labels=Y_train,
param_grid=param_grid,
n_cores=n_cores,
random_seed=random_seed,
**config['HyperOptimization'])
# We only want to save the feature values and one label array
X_train = [x[0] for x in X_train]
X_test = [x[0] for x in X_test]
temp_save_data = (trained_classifier, X_train, X_test, Y_train,
Y_test, patient_ID_train, patient_ID_test, random_seed)
save_data.append(temp_save_data)
# Test performance for various RS and ensemble sizes
if do_test_RS_Ensemble:
output_json = os.path.join(tempfolder, f'performance_RS_Ens_crossval_{i}.json')
test_RS_Ensemble(estimator_input=trained_classifier,
X_train=X_train, Y_train=Y_train,
X_test=X_test, Y_test=Y_test,
feature_labels=feature_labels,
output_json=output_json)
# Save memory
delattr(trained_classifier, 'fitted_workflows')
trained_classifier.fitted_workflows = list()
# Create a temporary save
if tempsave:
panda_labels = ['trained_classifier', 'X_train', 'X_test',
'Y_train', 'Y_test',
'config', 'patient_ID_train', 'patient_ID_test',
'random_seed', 'feature_labels']
panda_data_temp =\
pd.Series([trained_classifier, X_train, X_test, Y_train,
Y_test, config, patient_ID_train,
patient_ID_test, random_seed, feature_labels],
index=panda_labels,
name='Constructed crossvalidation')
panda_data = pd.DataFrame(panda_data_temp)
n = 0
filename = os.path.join(tempfolder, 'tempsave_' + str(i) + '.hdf5')
while os.path.exists(filename):
n += 1
filename = os.path.join(tempfolder, 'tempsave_' + str(i + n) + '.hdf5')
panda_data.to_hdf(filename, 'EstimatorData')
del panda_data, panda_data_temp
# Print elapsed time
elapsed = int((time.time() - t) / 60.0)
print(f'\t Fitting took {elapsed} minutes.')
logging.debug(f'\t Fitting took {elapsed} minutes.')
return save_data


def LOO_cross_validation(image_features, feature_labels, classes, patient_ids,
                         param_grid, config,
                         modus, test_size, start=0, save_data=None,
                         tempsave=False, tempfolder=None, fixedsplits=None,
                         fixed_seed=False, use_fastr=None,
                         fastr_plugin=None):
"""Cross-validation in which each sample is once used as the test set.
Mostly based on the default sklearn object.
Parameters
------------
Returns
------------
"""
print('Starting leave-one-out cross-validation.')
logging.debug('Starting leave-one-out cross-validation.')
cv = LeaveOneOut()
n_splits = cv.get_n_splits(image_features)
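    # With LeaveOneOut, the number of splits equals the number of samples:
    # each patient is held out exactly once as the test set (e.g. 100
    # patients yield 100 folds).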
if save_data is None:
        # Start from zero, thus empty list of previous data
save_data = list()
for i, (indices_train, indices_test) in enumerate(cv.split(image_features)):
if i < start:
continue
        print(f'Cross-validation iteration {i + 1} / {n_splits}.')
        logging.debug(f'Cross-validation iteration {i + 1} / {n_splits}.')
timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
print(f'\t Time: {timestamp}.')
logging.debug(f'\t Time: {timestamp}.')
if fixed_seed:
random_seed = i**2
else:
random_seed = np.random.randint(5000)
t = time.time()
# Split features and labels accordingly
X_train = [image_features[j] for j in indices_train]
X_test = [image_features[j] for j in indices_test]
patient_ID_train = [patient_ids[j] for j in indices_train]
patient_ID_test = [patient_ids[j] for j in indices_test]
if modus == 'singlelabel':
# Simply use the given class labels
classes_temp = classes.ravel()
# Split in training and testing
Y_train = classes_temp[indices_train]
Y_test = classes_temp[indices_test]
elif modus == 'multilabel':
            # Sklearn multiclass requires rows to be objects/patients
            classes_temp = np.transpose(classes)
# Split in training and testing
Y_train = classes_temp[indices_train, :]
Y_test = classes_temp[indices_test, :]
else:
            raise ae.WORCKeyError(f'{modus} is not a valid modus!')
# Find best hyperparameters and construct classifier
config['HyperOptimization']['use_fastr'] = use_fastr
config['HyperOptimization']['fastr_plugin'] = fastr_plugin
n_cores = config['General']['Joblib_ncores']
trained_classifier = random_search_parameters(features=X_train,
labels=Y_train,
param_grid=param_grid,
n_cores=n_cores,
random_seed=random_seed,
**config['HyperOptimization'])
# We only want to save the feature values and one label array
X_train = [x[0] for x in X_train]
X_test = [x[0] for x in X_test]
temp_save_data = (trained_classifier, X_train, X_test, Y_train,
Y_test, patient_ID_train, patient_ID_test, random_seed)
save_data.append(temp_save_data)
# Create a temporary save
if tempsave:
panda_labels = ['trained_classifier', 'X_train', 'X_test',
'Y_train', 'Y_test',
'config', 'patient_ID_train', 'patient_ID_test',
'random_seed', 'feature_labels']
panda_data_temp =\
pd.Series([trained_classifier, X_train, X_test, Y_train,
Y_test, config, patient_ID_train,
patient_ID_test, random_seed, feature_labels],
index=panda_labels,
name='Constructed crossvalidation')
panda_data = pd.DataFrame(panda_data_temp)
n = 0
filename = os.path.join(tempfolder, 'tempsave_' + str(i) + '.hdf5')
while os.path.exists(filename):
n += 1
filename = os.path.join(tempfolder, 'tempsave_' + str(i + n) + '.hdf5')
panda_data.to_hdf(filename, 'EstimatorData')
del panda_data, panda_data_temp
# Print elapsed time
elapsed = int((time.time() - t) / 60.0)
print(f'\t Fitting took {elapsed} minutes.')
logging.debug(f'\t Fitting took {elapsed} minutes.')
return save_data


def crossval(config, label_data, image_features,
             param_grid=None, use_fastr=False,
             fastr_plugin=None, tempsave=False,
             fixedsplits=None, ensemble={'Use': False}, outputfolder=None,
             modus='singlelabel'):
"""Constructs multiple individual classifiers based on the label settings.
Parameters
----------
config: dict, mandatory
Dictionary with config settings. See the Github Wiki for the
available fields and formatting.
label_data: dict, mandatory
Should contain the following:
patient_ids (list): ids of the patients, used to keep track of test and
training sets, and label data
label (list): List of lists, where each list contains the
label status for that patient for each
label
label_name (list): Contains the different names that are stored
in the label object
image_features: numpy array, mandatory
Consists of a tuple of two lists for each patient:
(feature_values, feature_labels)
    param_grid: dictionary, optional
        Contains the parameters and their values which are used in the
        grid or randomized search hyperparameter optimization. See the
        construct_classifier function for some examples.
    use_fastr: boolean, default False
        If False, parallel execution through Joblib is used for fast
        execution of the hyperparameter optimization. Especially suited
        for execution on multicore (H)PCs. The settings used are
        specified in the config.ini file in the IOparser folder, which you
        can adjust to your system.
        If True, fastr is used to split the hyperparameter optimization in
        separate jobs. Parameters for the splitting can be specified in the
        config file. Especially suited for clusters.
fastr_plugin: string, default None
Determines which plugin is used for fastr executions.
When None, uses the default plugin from the fastr config.
    tempsave: boolean, default False
        If True, create a .hdf5 file after each cross-validation iteration
        containing the classifier and results from that split. This is
        written to the GSOut folder in your fastr output mount. The
        combined result of all cross-validations is always saved to a
        single .hdf5 file, regardless of this setting.
    fixedsplits: string, optional
        By default, random-split cross-validation is used to train and
        evaluate the machine learning methods. Optionally, you can provide
        a .csv file containing fixed splits to be used. See the Github Wiki
        for the format.
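        The file is assumed to contain two columns per cross-validation
        iteration, named '0_train', '0_test', '1_train', '1_test', etc.,
        each listing the patient IDs of the respective set.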
ensemble: dictionary, optional
Contains the configuration for constructing an ensemble.
modus: string, default 'singlelabel'
Determine whether one-vs-all classification (or regression) for
each single label is used ('singlelabel') or if multilabel
classification is performed ('multilabel').
Returns
----------
panda_data: pandas dataframe
Contains all information on the trained classifier.
"""
# Process input data
patient_ids = label_data['patient_IDs']
label_value = label_data['label']
label_name = label_data['label_name']
if outputfolder is None:
outputfolder = os.getcwd()
logfilename = os.path.join(outputfolder, 'classifier.log')
print("Logging to file " + str(logfilename))
# Cross-validation iteration to start with
start = 0
save_data = list()
if tempsave:
tempfolder = os.path.join(outputfolder, 'tempsave')
if not os.path.exists(tempfolder):
# No previous tempsaves
os.makedirs(tempfolder)
else:
            # Previous tempsaves exist, so start where we left off
tempsaves = glob.glob(os.path.join(tempfolder, 'tempsave_*.hdf5'))
start = len(tempsaves)
# Load previous tempsaves and add to save data
tempsaves.sort()
for t in tempsaves:
t = pd.read_hdf(t)
t = t['Constructed crossvalidation']
temp_save_data = (t.trained_classifier, t.X_train, t.X_test,
t.Y_train, t.Y_test, t.patient_ID_train,
t.patient_ID_test, t.random_seed)
save_data.append(temp_save_data)
else:
tempfolder = None
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
logging.basicConfig(filename=logfilename, level=logging.DEBUG)
crossval_type = config['CrossValidation']['Type']
n_iterations = config['CrossValidation']['N_iterations']
test_size = config['CrossValidation']['test_size']
fixed_seed = config['CrossValidation']['fixed_seed']
    classifier_labels = dict()
logging.debug('Starting fitting of estimators.')
    # We only need one label instance, assuming they are all the same
feature_labels = image_features[0][1]
# Check if we need to use fixedsplits:
if fixedsplits is not None and '.csv' in fixedsplits:
fixedsplits = pd.read_csv(fixedsplits, header=0)
        # Fixed splits only make sense for random-split cross-validation, not for LOO
        if crossval_type == 'LOO':
            print('[WORC WARNING] Fixed splits cannot be used with LOO; switching to random-split cross-validation.')
            crossval_type = 'random_split'
    if modus == 'singlelabel':
        print('Performing single-label classification.')
        logging.debug('Performing single-label classification.')
    elif modus == 'multilabel':
        print('Performing multi-label classification.')
        logging.debug('Performing multi-label classification.')
label_value = [label_value]
label_name = [label_name]
else:
m = ('{} is not a valid modus!').format(modus)
logging.debug(m)
raise ae.WORCKeyError(m)
for i_class, i_name in zip(label_value, label_name):
if not tempsave:
save_data = list()
if crossval_type == 'random_split':
print('Performing random-split cross-validations.')
logging.debug('Performing random-split cross-validations.')
save_data =\
random_split_cross_validation(image_features=image_features,
feature_labels=feature_labels,
classes=i_class,
patient_ids=patient_ids,
n_iterations=n_iterations,
param_grid=param_grid,
config=config,
modus=modus,
test_size=test_size,
start=start,
save_data=save_data,
tempsave=tempsave,
tempfolder=tempfolder,
fixedsplits=fixedsplits,
fixed_seed=fixed_seed,
use_fastr=use_fastr,
fastr_plugin=fastr_plugin)
elif crossval_type == 'LOO':
print('Performing leave-one-out cross-validations.')
logging.debug('Performing leave-one-out cross-validations.')
save_data =\
LOO_cross_validation(image_features=image_features,
feature_labels=feature_labels,
classes=i_class,
patient_ids=patient_ids,
param_grid=param_grid,
config=config,
modus=modus,
test_size=test_size,
start=start,
save_data=save_data,
tempsave=tempsave,
tempfolder=tempfolder,
fixedsplits=fixedsplits,
fixed_seed=fixed_seed,
use_fastr=use_fastr,
fastr_plugin=fastr_plugin)
else:
raise ae.WORCKeyError(f'{crossval_type} is not a recognized cross-validation type.')
[classifiers, X_train_set, X_test_set, Y_train_set, Y_test_set,
patient_ID_train_set, patient_ID_test_set, seed_set] =\
zip(*save_data)
# Convert to lists
classifiers = list(classifiers)
X_train_set = list(X_train_set)
X_test_set = list(X_test_set)
Y_train_set = list(Y_train_set)
Y_test_set = list(Y_test_set)
patient_ID_train_set = list(patient_ID_train_set)
patient_ID_test_set = list(patient_ID_test_set)
seed_set = list(seed_set)
panda_labels = ['classifiers', 'X_train', 'X_test', 'Y_train', 'Y_test',
'config', 'patient_ID_train', 'patient_ID_test',
'random_seed', 'feature_labels']
panda_data_temp =\
pd.Series([classifiers, X_train_set, X_test_set, Y_train_set,
Y_test_set, config, patient_ID_train_set,
patient_ID_test_set, seed_set, feature_labels],
index=panda_labels,
name='Constructed crossvalidation')
if modus == 'singlelabel':
i_name = ''.join(i_name)
elif modus == 'multilabel':
i_name = ','.join(i_name)
        classifier_labels[i_name] = panda_data_temp
    panda_data = pd.DataFrame(classifier_labels)
return panda_data


def nocrossval(config, label_data_train, label_data_test, image_features_train,
               image_features_test, param_grid=None, use_fastr=False,
               fastr_plugin=None, ensemble={'Use': False},
               modus='singlelabel', do_test_RS_Ensemble=False):
"""Constructs multiple individual classifiers based on the label settings.
Arguments:
config (Dict): Dictionary with config settings
label_data (Dict): should contain:
patient_ids (list): ids of the patients, used to keep track of test and
training sets, and label data
label (list): List of lists, where each list contains the
label status for that patient for each
label
label_name (list): Contains the different names that are stored
in the label object
image_features (numpy array): Consists of a tuple of two lists for each patient:
(feature_values, feature_labels)
ensemble: dictionary, optional
Contains the configuration for constructing an ensemble.
modus: string, default 'singlelabel'
Determine whether one-vs-all classification (or regression) for
each single label is used ('singlelabel') or if multilabel
classification is performed ('multilabel').
Returns:
classifier_data (pandas dataframe)
"""
patient_ids_train = label_data_train['patient_IDs']
label_value_train = label_data_train['label']
label_name_train = label_data_train['label_name']
patient_ids_test = label_data_test['patient_IDs']
if 'label' in label_data_test.keys():
label_value_test = label_data_test['label']
else:
label_value_test = [None] * len(patient_ids_test)
logfilename = os.path.join(os.getcwd(), 'classifier.log')
logging.basicConfig(filename=logfilename, level=logging.DEBUG)
    classifier_labels = dict()
logging.debug('Starting classifier')
# Determine modus
    if modus == 'singlelabel':
        print('Performing single-label classification.')
        logging.debug('Performing single-label classification.')
    elif modus == 'multilabel':
        print('Performing multi-label classification.')
        logging.debug('Performing multi-label classification.')
label_name_train = [label_name_train]
else:
m = ('{} is not a valid modus!').format(modus)
logging.debug(m)
raise ae.WORCKeyError(m)
    # We only need one label instance, assuming they are all the same
feature_labels = image_features_train[0][1]
for i_name in label_name_train:
save_data = list()
random_seed = np.random.randint(5000)
# Split into test and training set, where the percentage of each
# label is maintained
X_train = image_features_train
X_test = image_features_test
if modus == 'singlelabel':
Y_train = label_value_train.ravel()
Y_test = label_value_test.ravel()
else:
            # Sklearn multiclass requires rows to be objects/patients
            Y_train = np.transpose(label_value_train)
            Y_test = np.transpose(label_value_test)
# Find best hyperparameters and construct classifier
config['HyperOptimization']['use_fastr'] = use_fastr
config['HyperOptimization']['fastr_plugin'] = fastr_plugin
n_cores = config['General']['Joblib_ncores']
trained_classifier =\
random_search_parameters(features=X_train,
labels=Y_train,
param_grid=param_grid,
n_cores=n_cores,
**config['HyperOptimization'])
# Create an ensemble if required
# NOTE: removed to keep memory and storage usage low
# trained_classifier.create_ensemble(X_train, Y_train, method=ensemble['Use'])
# Extract the feature values
X_train = np.asarray([x[0] for x in X_train])
X_test = np.asarray([x[0] for x in X_test])
temp_save_data = (trained_classifier, X_train, X_test, Y_train,
Y_test, patient_ids_train, patient_ids_test, random_seed)
save_data.append(temp_save_data)
[classifiers, X_train_set, X_test_set, Y_train_set, Y_test_set,
patient_ID_train_set, patient_ID_test_set, seed_set] =\
zip(*save_data)
panda_labels = ['classifiers', 'X_train', 'X_test', 'Y_train', 'Y_test',
'config', 'patient_ID_train', 'patient_ID_test',
'random_seed', 'feature_labels']
panda_data_temp =\
pd.Series([classifiers, X_train_set, X_test_set, Y_train_set,
Y_test_set, config, patient_ID_train_set,
patient_ID_test_set, seed_set, feature_labels],
index=panda_labels,
name='Constructed crossvalidation')
i_name = ''.join(i_name)
        classifier_labels[i_name] = panda_data_temp
# Test performance for various RS and ensemble sizes
if do_test_RS_Ensemble:
            # FIXME: use the home folder, as this function does not know
            # where the final or temporary output is located
            output_json = os.path.join(os.path.expanduser("~"),
                                       'performance_RS_Ens.json')
test_RS_Ensemble(estimator_input=trained_classifier,
X_train=X_train, Y_train=Y_train,
X_test=X_test, Y_test=Y_test,
feature_labels=feature_labels,
output_json=output_json)
# Save memory
delattr(trained_classifier, 'fitted_workflows')
trained_classifier.fitted_workflows = list()
    panda_data = pd.DataFrame(classifier_labels)
return panda_data


def test_RS_Ensemble(estimator_input, X_train, Y_train, X_test, Y_test,
                     feature_labels, output_json):
"""Test performance for different random search and ensemble sizes.
This function is written for conducting a specific experiment from the
WORC paper to test how the performance varies with varying random search
and ensemble sizes. We do not recommend usage in general of this part.
"""
# Process some input
estimator_original = copy(estimator_input)
X_train_temp = [(x, feature_labels) for x in X_train]
n_workflows = len(estimator_original.fitted_workflows)
# Settings
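    # Each random-search size is repeated 10 times (plus one run using all
    # fitted workflows) to assess the variance of randomly subsampling
    # workflows; each ensemble size is then evaluated on top of each subset.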
RSs = [10, 50, 100, 1000, 10000] * 10 + [n_workflows]
ensembles = [1, 10, 50, 100]
maxlen = max(ensembles)
# Loop over the random searches and ensembles
keys = list()
performances = dict()
for RS in RSs:
if RS <= n_workflows:
# Make a key for saving the score
num = 0
key = f'RS {RS} try {str(num).zfill(2)}'
while key in keys:
num += 1
key = f'RS {RS} try {str(num).zfill(2)}'
keys.append(key)
# Make a local copy of the estimator and select only subset of workflows
print(f'\t Using RS {RS}.')
estimator = copy(estimator_original)
workflow_num = np.arange(n_workflows).tolist()
# Select only a specific set of workflows
random.shuffle(workflow_num)
selected_workflows = workflow_num[0:RS]
# Get the mean performances and get new ranking
F1_validation = estimator.cv_results_['mean_test_score']
F1_validation = [F1_validation[i] for i in selected_workflows]
            # argsort ranks from smallest to largest, so reverse for descending order
            workflow_ranking = np.argsort(np.asarray(F1_validation)).tolist()[::-1]
F1_validation = [F1_validation[i] for i in workflow_ranking]
# Only keep the number of RS required and resort based on ensemble
estimator.fitted_workflows =\
[estimator.fitted_workflows[i] for i in selected_workflows]
estimator.fitted_workflows =\
[estimator.fitted_workflows[i] for i in workflow_ranking]
            # Store the train and validation F1-scores
mean_val_F1 = F1_validation[0:maxlen]
F1_training = estimator.cv_results_['mean_train_score']
F1_training = [F1_training[i] for i in selected_workflows]
F1_training = [F1_training[i] for i in workflow_ranking]
mean_train_F1 = F1_training[0:maxlen]
performances[f'Mean training F1-score {key} top {maxlen}'] = mean_train_F1
performances[f'Mean validation F1-score {key} top {maxlen}'] = mean_val_F1
for ensemble in ensembles:
if ensemble <= RS:
print(f'\t Using ensemble {ensemble}.')
# Create the ensemble
estimator.create_ensemble(X_train_temp, Y_train, method=ensemble)
# Compute performance
y_prediction = estimator.predict(X_test)
y_score = estimator.predict_proba(X_test)[:, 1]
auc = roc_auc_score(Y_test, y_score)
f1_score_out = f1_score(Y_test, y_prediction, average='weighted')
performances[f'Test F1-score Ensemble {ensemble} {key}'] = f1_score_out
performances[f'Test AUC Ensemble {ensemble} {key}'] = auc
y_prediction = estimator.predict(X_train)
y_score = estimator.predict_proba(X_train)[:, 1]
auc = roc_auc_score(Y_train, y_score)
f1_score_out = f1_score(Y_train, y_prediction, average='weighted')
performances[f'Train F1-score Ensemble {ensemble} {key}'] = f1_score_out
performances[f'Train AUC Ensemble {ensemble} {key}'] = auc
# Write output
with open(output_json, 'w') as fp:
json.dump(performances, fp, sort_keys=True, indent=4)