Source code for WORC.plotting.plot_hyperparameters

#!/usr/bin/env python

# Copyright 2016-2020 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import pandas as pd
import WORC.addexceptions as ae


def plot_hyperparameters(prediction, label_type=None, estsize=50,
                         output=None, removeconstants=False, verbose=False):
    """Gather which hyperparameters have been used in the best workflows.

    Parameters
    ----------
    prediction: pandas dataframe or string, mandatory
        Output of the trainclassifier function, either a pandas dataframe
        or an HDF5 file.

    label_type: string, default None
        Name of the predicted label to inspect. If None, or if a
        comma-separated (multiclass) label is given, the first label in the
        prediction file is used.

    estsize: integer, default 50
        Number of estimators that should be taken into account.

    output: filename of csv, default None
        Output file to write to. If None, no output is written; the result
        is only returned as a variable.

    removeconstants: boolean, default False
        Determine whether to remove any hyperparameters which have the same
        value in all workflows.

    verbose: boolean, default False
        Whether to show print messages or not.

    Returns
    -------
    data: dictionary
        Mapping from hyperparameter names to the list of values used in
        the selected workflows.

    """
    # Load the prediction file
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)
        else:
            raise ae.WORCIOError(f'{prediction} is not an existing file!')

    # Select the estimator from the pandas dataframe to use
    keys = prediction.keys()
    if label_type is None:
        label_type = keys[0]
    elif len(label_type.split(',')) != 1:
        # Multiclass, just take the prediction label
        label_type = keys[0]

    prediction = prediction[label_type]

    # Loop over classifiers
    total = len(prediction.classifiers)
    for cnum, cls in enumerate(prediction.classifiers):
        if verbose:
            print(f'Extracting hyperparameters for iteration {cnum + 1} / {total}.')

        # Get parameters and select only a set number
        parameters = cls.cv_results_['params']
        if len(parameters) > estsize:
            parameters = parameters[0:estsize]

        # Additional information besides the parameters. Loop over
        # len(parameters) instead of estsize, as there may be fewer than
        # estsize workflows available.
        for i in range(len(parameters)):
            # Add which (cross-validation) iteration is used and the rank
            parameters[i]['Iteration'] = cnum + 1
            parameters[i]['Rank'] = i + 1

            # Add some statistics
            parameters[i]['Metric'] = cls.scoring
            parameters[i]['mean_train_score'] =\
                cls.cv_results_['mean_train_score'][i]
            parameters[i]['mean_fit_time'] =\
                cls.cv_results_['mean_fit_time'][i]
            parameters[i]['std_train_score'] =\
                cls.cv_results_['std_train_score'][i]
            parameters[i]['generalization_score'] =\
                cls.cv_results_['generalization_score'][i]
            parameters[i]['rank_generalization_score'] =\
                cls.cv_results_['rank_generalization_score'][i]

            # NOTE: while this is called test score, it is the score on the
            # validation dataset(s)
            parameters[i]['mean_validation_score'] =\
                cls.cv_results_['mean_test_score'][i]
            parameters[i]['std_validation_score'] =\
                cls.cv_results_['std_test_score'][i]

            # Initialize data object on the very first workflow
            if cnum == 0 and i == 0:
                data = {k: list() for k in parameters[i].keys()}

        # Add to general data object
        for p in parameters:
            for k in p.keys():
                data[k].append(p[k])

    # Optionally, remove any hyperparameters which have the same
    # value in all workflows
    n_parameters = len(list(data.keys()))
    if removeconstants:
        if verbose:
            print('Removing parameters with constant values.')

        keys = list(data.keys())
        for k in keys:
            # First convert all values to strings so we can use set
            tempdata = [str(i) for i in data[k]]

            # Count unique values, and if only one, delete
            n_unique = len(list(set(tempdata)))
            if n_unique == 1:
                if verbose:
                    print(f'\t Removing parameter {k}.')
                del data[k]

    # Write to csv if output name is provided
    if output is not None:
        if verbose:
            print(f'Writing output to {output}.')

        # First, specify order of columns for easy reading
        columns = list(data.keys())
        starters = ['Iteration', 'Rank', 'Metric', 'mean_validation_score',
                    'mean_train_score', 'mean_fit_time']

        # Skip any starters that were removed above as constants, e.g.
        # Metric, which is usually identical in all workflows
        starters = [key for key in starters if key in columns]
        for key in starters:
            columns.remove(key)

        columns = starters + columns

        # Write to dataframe
        df = pd.DataFrame(data)
        df.to_csv(output, index=False, columns=columns)

    # Display some information
    if verbose:
        print(f'Number of hyperparameters: {n_parameters}.')
        if removeconstants:
            n_parameters_unique = len(list(data.keys()))
            print(f'Number of hyperparameters with unique values: {n_parameters_unique}.')

    return data
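# Example usage: a minimal sketch of calling this function on the result of
# a WORC experiment. The HDF5 filename and label name below are hypothetical
# placeholders; substitute the estimator file and label from your own
# experiment output.
if __name__ == '__main__':
    hyperparams = plot_hyperparameters(
        'estimator_all_0.hdf5',        # hypothetical trainclassifier output
        label_type='Label1',           # hypothetical label name
        estsize=50,                    # inspect the top-50 ranked workflows
        output='hyperparameters.csv',  # write the result to csv
        removeconstants=True,          # drop hyperparameters that never vary
        verbose=True)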