Source code for WORC.plotting.plot_hyperparameters

#!/usr/bin/env python

# Copyright 2016-2020 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import pandas as pd
import WORC.addexceptions as ae


def plot_hyperparameters(prediction, label_type=None, estsize=50,
                         output=None, removeconstants=False, verbose=False):
    """Gather which hyperparameters have been used in the best workflows.

    Parameters
    ----------
    prediction: pandas dataframe or string, mandatory
        Output of the trainclassifier function, either a pandas dataframe
        or an HDF5 file.

    label_type: string, default None
        Name of the predicted label to inspect. If None, or if a
        comma-separated (multiclass) label is given, the first label in the
        prediction file is used.

    estsize: integer, default 50
        Number of estimators that should be taken into account.

    output: filename of csv, default None
        Output file to write to. If None, no output is written; the result
        is only returned as a variable.

    removeconstants: boolean, default False
        Determine whether to remove any hyperparameters which have the same
        value in all workflows.

    verbose: boolean, default False
        Whether to show print messages or not.

    Returns
    -------
    data: dictionary
        Mapping from hyperparameter names to the list of values used in
        the selected workflows.

    """
    # Load the prediction file
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)
        else:
            raise ae.WORCIOError(f'{prediction} is not an existing file!')

    # Select the estimator from the pandas dataframe to use
    keys = prediction.keys()
    if label_type is None:
        label_type = keys[0]
    elif len(label_type.split(',')) != 1:
        # Multiclass, just take the prediction label
        label_type = keys[0]

    prediction = prediction[label_type]

    # Loop over classifiers
    total = len(prediction.classifiers)
    for cnum, cls in enumerate(prediction.classifiers):
        if verbose:
            print(f'Extracting hyperparameters for iteration {cnum + 1} / {total}.')

        # Get parameters and select only a set number
        parameters = cls.cv_results_['params']
        if len(parameters) > estsize:
            parameters = parameters[0:estsize]

        # Additional information besides the parameters. Loop over
        # len(parameters) instead of estsize, as there may be fewer than
        # estsize workflows available.
        for i in range(len(parameters)):
            # Add which (cross-validation) iteration is used and the rank
            parameters[i]['Iteration'] = cnum + 1
            parameters[i]['Rank'] = i + 1

            # Add some statistics
            parameters[i]['Metric'] = cls.scoring
            parameters[i]['mean_train_score'] =\
                cls.cv_results_['mean_train_score'][i]
            parameters[i]['mean_fit_time'] =\
                cls.cv_results_['mean_fit_time'][i]
            parameters[i]['std_train_score'] =\
                cls.cv_results_['std_train_score'][i]
            parameters[i]['generalization_score'] =\
                cls.cv_results_['generalization_score'][i]
            parameters[i]['rank_generalization_score'] =\
                cls.cv_results_['rank_generalization_score'][i]

            # NOTE: while this is called test score, it is the score on the
            # validation dataset(s)
            parameters[i]['mean_validation_score'] =\
                cls.cv_results_['mean_test_score'][i]
            parameters[i]['std_validation_score'] =\
                cls.cv_results_['std_test_score'][i]

            # Initialize data object on the very first workflow
            if cnum == 0 and i == 0:
                data = {k: list() for k in parameters[i].keys()}

        # Add to general data object
        for p in parameters:
            for k in p.keys():
                data[k].append(p[k])

    # Optionally, remove any hyperparameters which have the same
    # value in all workflows
    n_parameters = len(list(data.keys()))
    if removeconstants:
        if verbose:
            print('Removing parameters with constant values.')

        keys = list(data.keys())
        for k in keys:
            # First convert all values to strings so we can use set
            tempdata = [str(i) for i in data[k]]

            # Count unique values, and if only one, delete
            n_unique = len(list(set(tempdata)))
            if n_unique == 1:
                if verbose:
                    print(f'\t Removing parameter {k}.')
                del data[k]

    # Write to csv if output name is provided
    if output is not None:
        if verbose:
            print(f'Writing output to {output}.')

        # First, specify order of columns for easy reading
        columns = list(data.keys())
        starters = ['Iteration', 'Rank', 'Metric', 'mean_validation_score',
                    'mean_train_score', 'mean_fit_time']

        # Skip any starters that were removed above as constants, e.g.
        # Metric, which is usually identical in all workflows
        starters = [key for key in starters if key in columns]
        for key in starters:
            columns.remove(key)

        columns = starters + columns

        # Write to dataframe
        df = pd.DataFrame(data)
        df.to_csv(output, index=False, columns=columns)

    # Display some information
    if verbose:
        print(f'Number of hyperparameters: {n_parameters}.')
        if removeconstants:
            n_parameters_unique = len(list(data.keys()))
            print(f'Number of hyperparameters with unique values: {n_parameters_unique}.')

    return data
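# Example usage: a minimal sketch of calling this function on the result of
# a WORC experiment. The HDF5 filename and label name below are hypothetical
# placeholders; substitute the estimator file and label from your own
# experiment output.
if __name__ == '__main__':
    hyperparams = plot_hyperparameters(
        'estimator_all_0.hdf5',        # hypothetical trainclassifier output
        label_type='Label1',           # hypothetical label name
        estsize=50,                    # inspect the top-50 ranked workflows
        output='hyperparameters.csv',  # write the result to csv
        removeconstants=True,          # drop hyperparameters that never vary
        verbose=True)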