# Source code for WORC.classification.parameter_optimization

#!/usr/bin/env python

# Copyright 2016-2020 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from sklearn.utils import check_random_state
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
from WORC.classification.SearchCV import RandomizedSearchCVfastr, RandomizedSearchCVJoblib, GuidedSearchCVSMAC


def random_search_parameters(features, labels, N_iter, test_size,
                             param_grid, scoring_method, n_splits=5,
                             n_jobspercore=200, use_fastr=False,
                             n_cores=1, fastr_plugin=None,
                             memory='2G', maxlen=100, ranking_score='test_score',
                             random_seed=None,
                             refit_training_workflows=False,
                             refit_validation_workflows=False):
    """
    Fit a randomized hyperparameter search on the given data and report the
    best configuration that was found.

    Arguments:
        features: numpy array containing the training features.
        labels: list containing the object labels to be trained on.
        N_iter: integer, number of iterations used in the hyperparameter
                optimization (forwarded as ``n_iter``).
        test_size: float, test size fraction used by the cross-validation
                   splitter.
        param_grid: dictionary containing all possible hyperparameters and
                    their values or distributions. Must contain the key
                    'classifiers'; its entries decide whether a stratified
                    split can be used.
        scoring_method: string defining the scoring method used in the
                        optimization, e.g. f1_weighted for a SVM.
        n_splits: integer, number of splits produced by the cross-validation
                  splitter.
        n_jobspercore: integer, number of jobs that are run on a single core
                       (forwarded to the search object).
        use_fastr: Boolean; when truthy RandomizedSearchCVfastr is used,
                   otherwise RandomizedSearchCVJoblib.
        n_cores: integer, forwarded to the search object as ``n_jobs``.
        fastr_plugin: determines which plugin is used for fastr executions.
                When None, uses the default plugin from the fastr config.
        memory: forwarded unchanged to the search object.
        maxlen: forwarded unchanged to the search object.
        ranking_score: forwarded unchanged to the search object.
        random_seed: seed for the cross-validation splitter. When None, an
                     integer seed is drawn with np.random.randint(1, 5000).
        refit_training_workflows: forwarded unchanged to the search object.
        refit_validation_workflows: forwarded unchanged to the search object.

    Returns:
        random_search: the fitted search object containing the results.
    """
    # Draw a seed only when the caller did not provide one.
    seed = np.random.randint(1, 5000) if random_seed is None else random_seed
    rng = check_random_state(seed)

    # A stratified shuffle split cannot be done with regression, so fall
    # back to a plain shuffle split as soon as one regressor is requested.
    regression_estimators = ('SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet')
    is_regression = any(name in regression_estimators
                        for name in param_grid['classifiers'])
    splitter = ShuffleSplit if is_regression else StratifiedShuffleSplit
    cv = splitter(n_splits=n_splits, test_size=test_size, random_state=rng)

    # Both back-ends take exactly the same keyword arguments, so select the
    # class first and instantiate it once.
    search_class = RandomizedSearchCVfastr if use_fastr else RandomizedSearchCVJoblib
    search_kwargs = dict(
        param_distributions=param_grid,
        n_iter=N_iter,
        scoring=scoring_method,
        n_jobs=n_cores,
        n_jobspercore=n_jobspercore,
        maxlen=maxlen,
        verbose=1,
        cv=cv,
        fastr_plugin=fastr_plugin,
        memory=memory,
        ranking_score=ranking_score,
        refit_training_workflows=refit_training_workflows,
        refit_validation_workflows=refit_validation_workflows,
    )
    random_search = search_class(**search_kwargs)
    random_search.fit(features, labels)

    # Report the winning configuration and its score on stdout.
    print("Best found parameters:")
    for name, value in random_search.best_params_.items():
        print(f'{name}: {value}.')
    print(f"\n Best score using best parameters: {scoring_method} = {random_search.best_score_}")

    return random_search


def guided_search_parameters(features, labels, N_iter, test_size,
                             parameters, scoring_method, n_splits=5,
                             n_jobspercore=200, use_fastr=False,
                             n_cores=1, fastr_plugin=None,
                             memory='2G', maxlen=100, ranking_score='test_score',
                             random_seed=None, refit_training_workflows=False,
                             refit_validation_workflows=False,
                             smac_result_file=None):
    """
    Fit a guided (Bayesian optimization, SMAC-based) hyperparameter search on
    the given data and report the best configuration that was found.

    Arguments:
        features: numpy array containing the training features.
        labels: list containing the object labels to be trained on.
        N_iter: integer, number of iterations used in the hyperparameter
                optimization (forwarded as ``n_iter``).
        test_size: float, test size fraction used by the cross-validation
                   splitter.
        parameters: dictionary describing the search space, forwarded as
                    ``param_distributions``. Must contain
                    parameters['Classification']['classifiers']; its entries
                    decide whether a stratified split can be used.
        scoring_method: string defining the scoring method used in the
                        optimization, e.g. f1_weighted for a SVM.
        n_splits: integer, number of splits produced by the cross-validation
                  splitter.
        n_jobspercore: integer, number of jobs that are run on a single core
                       (forwarded to the search object).
        use_fastr: accepted for signature compatibility; not used in this
                   function.
        n_cores: integer, forwarded to the search object as ``n_jobs``.
        fastr_plugin: determines which plugin is used for fastr executions.
                When None, uses the default plugin from the fastr config.
        memory: accepted for signature compatibility; not used in this
                function.
        maxlen: forwarded unchanged to the search object.
        ranking_score: forwarded unchanged to the search object.
        random_seed: seed for the cross-validation splitter. When None, the
                     fixed seed 42 is used.
        refit_training_workflows: forwarded unchanged to the search object.
        refit_validation_workflows: forwarded unchanged to the search object.
        smac_result_file: forwarded unchanged to the search object.

    Returns:
        guided_search: the fitted search object containing the results.
    """
    # The fallback seed is deliberately constant (introduced to fix the
    # random seed for testing), so the splits are reproducible by default.
    # NOTE(review): random_search_parameters draws a random seed in this
    # situation instead - confirm the fixed value is still intended.
    seed = 42 if random_seed is None else random_seed
    rng = check_random_state(seed)

    # A stratified shuffle split cannot be done with regression, so fall
    # back to a plain shuffle split as soon as one regressor is requested.
    regression_estimators = ('SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet')
    candidates = parameters['Classification']['classifiers']
    is_regression = any(name in regression_estimators for name in candidates)
    splitter = ShuffleSplit if is_regression else StratifiedShuffleSplit
    cv = splitter(n_splits=n_splits, test_size=test_size, random_state=rng)

    search_kwargs = dict(
        param_distributions=parameters,
        n_iter=N_iter,
        scoring=scoring_method,
        n_jobs=n_cores,
        n_jobspercore=n_jobspercore,
        maxlen=maxlen,
        verbose=1,
        cv=cv,
        fastr_plugin=fastr_plugin,
        ranking_score=ranking_score,
        features=features,
        labels=labels,
        smac_result_file=smac_result_file,
        refit_training_workflows=refit_training_workflows,
        refit_validation_workflows=refit_validation_workflows,
    )
    guided_search = GuidedSearchCVSMAC(**search_kwargs)
    guided_search.fit(features, labels)

    # Report the winning configuration and its score on stdout.
    print("Best found parameters:")
    for name, value in guided_search.best_params_.items():
        print(f'{name}: {value}.')
    print("\n Best score using best parameters:")
    print(guided_search.best_score_)

    return guided_search