Source code for WORC.classification.parameter_optimization

#!/usr/bin/env python

# Copyright 2016-2020 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from sklearn.utils import check_random_state
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
from WORC.classification.SearchCV import RandomizedSearchCVfastr, RandomizedSearchCVJoblib, GuidedSearchCVSMAC


def random_search_parameters(features, labels, N_iter, test_size,
                             param_grid, scoring_method, n_splits=5,
                             n_jobspercore=200, use_fastr=False,
                             n_cores=1, fastr_plugin=None,
                             memory='2G', maxlen=100,
                             ranking_score='test_score',
                             random_seed=None,
                             refit_training_workflows=False,
                             refit_validation_workflows=False):
    """
    Train a classifier while optimizing its hyperparameters through a
    randomized search, and return the fitted search object.

    Arguments:
        features: numpy array containing the training features.
        labels: list containing the object labels to be trained on.
        N_iter: integer, number of iterations of the hyperparameter
            optimization (forwarded as ``n_iter``).
        test_size: float, test size used by the cross-validation splitter.
        param_grid: dictionary containing all possible hyperparameters and
            their values or distributions. Must contain the key
            'classifiers', which is inspected to decide between a
            stratified and a plain shuffle split.
        scoring_method: string defining the scoring method used in the
            optimization, e.g. f1_weighted for a SVM.
        n_splits: integer, number of cross-validation splits.
        n_jobspercore: integer, number of jobs that are run on a single
            core when using the fastr randomized search.
        use_fastr: Boolean determining whether fastr (True) or joblib
            (False) is used for the optimization.
        n_cores: integer, forwarded to the search object as ``n_jobs``.
        fastr_plugin: determines which plugin is used for fastr executions.
            When None, uses the default plugin from the fastr config.
        memory: forwarded unchanged to the search object.
        maxlen: forwarded unchanged to the search object.
        ranking_score: forwarded unchanged to the search object.
        random_seed: seed for the cross-validation splitter. When None, a
            seed is drawn with ``np.random.randint(1, 5000)``.
        refit_training_workflows: forwarded unchanged to the search object.
        refit_validation_workflows: forwarded unchanged to the search
            object.

    Returns:
        random_search: the fitted RandomizedSearchCVfastr or
            RandomizedSearchCVJoblib object containing the results.
    """
    if random_seed is None:
        random_seed = np.random.randint(1, 5000)
    random_state = check_random_state(random_seed)

    # Stratification needs discrete class labels, so it cannot be used as
    # soon as one of the candidate estimators is a regressor.
    regressor_names = ('SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet')
    has_regressor = any(name in regressor_names
                        for name in param_grid['classifiers'])
    splitter_class = ShuffleSplit if has_regressor else StratifiedShuffleSplit
    cv = splitter_class(n_splits=n_splits,
                        test_size=test_size,
                        random_state=random_state)

    # Both back-ends are constructed with exactly the same keyword set;
    # only the class differs.
    if use_fastr:
        search_class = RandomizedSearchCVfastr
    else:
        search_class = RandomizedSearchCVJoblib

    random_search = search_class(
        param_distributions=param_grid,
        n_iter=N_iter,
        scoring=scoring_method,
        n_jobs=n_cores,
        n_jobspercore=n_jobspercore,
        maxlen=maxlen,
        verbose=1,
        cv=cv,
        fastr_plugin=fastr_plugin,
        memory=memory,
        ranking_score=ranking_score,
        refit_training_workflows=refit_training_workflows,
        refit_validation_workflows=refit_validation_workflows)

    random_search.fit(features, labels)

    print("Best found parameters:")
    for param_name, param_value in random_search.best_params_.items():
        print(f'{param_name}: {param_value}.')
    print(f"\n Best score using best parameters: {scoring_method} = {random_search.best_score_}")

    return random_search
def guided_search_parameters(features, labels, N_iter, test_size,
                             parameters, scoring_method, n_splits=5,
                             n_jobspercore=200, use_fastr=False,
                             n_cores=1, fastr_plugin=None,
                             memory='2G', maxlen=100,
                             ranking_score='test_score',
                             random_seed=None,
                             refit_training_workflows=False,
                             refit_validation_workflows=False,
                             smac_result_file=None):
    """
    Train a classifier while optimizing its hyperparameters through a
    Bayesian optimization approach, and return the fitted search object.

    Arguments:
        features: numpy array containing the training features.
        labels: list containing the object labels to be trained on.
        N_iter: integer, number of iterations of the hyperparameter
            optimization (forwarded as ``n_iter``).
        test_size: float, test size used by the cross-validation splitter.
        parameters: dictionary with the search configuration, forwarded as
            ``param_distributions``. Must contain
            parameters['Classification']['classifiers'], which is inspected
            to decide between a stratified and a plain shuffle split.
        scoring_method: string defining the scoring method used in the
            optimization, e.g. f1_weighted for a SVM.
        n_splits: integer, number of cross-validation splits.
        n_jobspercore: forwarded unchanged to the search object.
        use_fastr: accepted but not used by this function.
        n_cores: integer, forwarded to the search object as ``n_jobs``.
        fastr_plugin: determines which plugin is used for fastr executions.
            When None, uses the default plugin from the fastr config.
        memory: accepted but not used by this function.
        maxlen: forwarded unchanged to the search object.
        ranking_score: forwarded unchanged to the search object.
        random_seed: seed for the cross-validation splitter. When None, the
            fixed seed 42 is used.
        refit_training_workflows: forwarded unchanged to the search object.
        refit_validation_workflows: forwarded unchanged to the search
            object.
        smac_result_file: forwarded unchanged to the search object.

    Returns:
        guided_search: the fitted GuidedSearchCVSMAC object containing the
            results.
    """
    # NOTE(review): the default seed is pinned "for testing", whereas
    # random_search_parameters draws np.random.randint(1, 5000) instead.
    # Confirm whether the fixed default is still intended.
    if random_seed is None:
        random_seed = 42
    random_state = check_random_state(random_seed)

    # Stratification needs discrete class labels, so it cannot be used as
    # soon as one of the candidate estimators is a regressor.
    regressor_names = ('SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet')
    candidate_estimators = parameters['Classification']['classifiers']
    has_regressor = any(name in regressor_names
                        for name in candidate_estimators)
    splitter_class = ShuffleSplit if has_regressor else StratifiedShuffleSplit
    cv = splitter_class(n_splits=n_splits,
                        test_size=test_size,
                        random_state=random_state)

    guided_search = GuidedSearchCVSMAC(
        param_distributions=parameters,
        n_iter=N_iter,
        scoring=scoring_method,
        n_jobs=n_cores,
        n_jobspercore=n_jobspercore,
        maxlen=maxlen,
        verbose=1,
        cv=cv,
        fastr_plugin=fastr_plugin,
        ranking_score=ranking_score,
        features=features,
        labels=labels,
        smac_result_file=smac_result_file,
        refit_training_workflows=refit_training_workflows,
        refit_validation_workflows=refit_validation_workflows)

    guided_search.fit(features, labels)

    print("Best found parameters:")
    for param_name, param_value in guided_search.best_params_.items():
        print(f'{param_name}: {param_value}.')
    print("\n Best score using best parameters:")
    print(guided_search.best_score_)

    return guided_search