Source code for WORC.classification.SearchCV

#!/usr/bin/env python

# Copyright 2016-2022 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from abc import ABCMeta, abstractmethod
from collections.abc import Sized
import numpy as np
import warnings
import numbers
import random
import string
import fastr
from fastr.api import ResourceLimit
from joblib import Parallel, delayed
from scipy.stats import rankdata
import six
import pandas as pd
import json
import glob
from itertools import islice
import shutil

from sklearn.model_selection._search import ParameterSampler
from sklearn.model_selection._search import ParameterGrid, _check_param_grid
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, is_classifier, clone
from sklearn.base import MetaEstimatorMixin
from sklearn.exceptions import NotFittedError
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.utils.validation import indexable, check_is_fitted
from sklearn.model_selection._split import check_cv
from sklearn.metrics import f1_score, roc_auc_score, mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.utils.validation import _check_fit_params
from sklearn.model_selection._validation import _aggregate_score_dicts

from WORC.classification.fitandscore import fit_and_score, replacenan
from WORC.classification.metrics import check_multimetric_scoring
from WORC.classification import construct_classifier as cc
from WORC.featureprocessing.Preprocessor import Preprocessor
from WORC.detectors.detectors import DebugDetector
import WORC.addexceptions as WORCexceptions

# Imports used in the Bayesian optimization
from WORC.classification.smac import build_smac_config
from datetime import datetime
import copy


def rms_score(truth, prediction):
    """Root-mean-square-error metric."""
    return np.sqrt(mean_squared_error(truth, prediction))

def sar_score(truth, prediction):
    """SAR metric from Caruana et al. 2004."""
    ROC = roc_auc_score(truth, prediction)

    # Convert score to binaries first
    for num in range(0, len(prediction)):
        if prediction[num] >= 0.5:
            prediction[num] = 1
        else:
            prediction[num] = 0

    ACC = accuracy_score(truth, prediction)
    RMS = rms_score(truth, prediction)
    SAR = (ACC + ROC + (1 - RMS)) / 3
    return SAR

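# Worked example for sar_score (illustrative values only, not taken from WORC
# itself): for truth [0, 1, 1, 0] and predicted scores [0.1, 0.8, 0.6, 0.4],
# roc_auc_score is 1.0; after thresholding at 0.5 the predictions become
# [0, 1, 1, 0], so accuracy is 1.0 and the root-mean-square error is 0.0,
# giving sar_score = (1.0 + 1.0 + (1 - 0.0)) / 3 = 1.0.
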
def chunksdict(data, SIZE):
    """Split a dictionary into successive chunks of a given size."""
    it = iter(data)
    for i in range(0, len(data), SIZE):
        yield {k: data[k] for k in islice(it, SIZE)}

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

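# Minimal usage sketch of the two chunking helpers above (the inputs here are
# made up for illustration; chunks() is used further below in _fit to split the
# sampled parameter settings into n_jobspercore-sized batches):
#
#     >>> list(chunks(list(range(10)), 4))
#     [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
#     >>> list(chunksdict({'a': 1, 'b': 2, 'c': 3}, 2))
#     [{'a': 1, 'b': 2}, {'c': 3}]
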
class Ensemble(six.with_metaclass(ABCMeta, BaseEstimator,
                                  MetaEstimatorMixin)):
    """Ensemble of BaseSearchCV Estimators."""

    # @abstractmethod
    def __init__(self, estimators):
        """Initialize object with list of estimators."""
        if not estimators:
            message = 'You supplied an empty list of estimators: No ensemble creation possible.'
            raise WORCexceptions.WORCValueError(message)

        self.estimators = estimators
        self.n_estimators = len(estimators)

    def predict(self, X):
        """Call predict on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator
        supports ``predict``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self.estimators[0]._check_is_fitted('predict')

        # Check if we are dealing with multilabel
        if len(self.estimators[0].predict(X).shape) == 1:
            nlabels = 1
        else:
            nlabels = self.estimators[0].predict(X).shape[1]

        if type(self.estimators[0].best_estimator_) == OneVsRestClassifier:
            multilabel = True
        elif nlabels > 1:
            multilabel = True
        else:
            multilabel = False

        if multilabel:
            # Multilabel
            outcome = np.zeros((self.n_estimators, len(X), nlabels))
            for num, est in enumerate(self.estimators):
                if hasattr(est, 'predict_proba'):
                    # BUG: SVM kernel can be wrong type
                    if hasattr(est.best_estimator_, 'kernel'):
                        est.best_estimator_.kernel = str(est.best_estimator_.kernel)
                    outcome[num, :, :] = est.predict_proba(X)
                else:
                    outcome[num, :, :] = est.predict(X)

            # Replace NaN if they are there
            if np.isnan(outcome).any():
                print('[WARNING] Predictions contain NaN, removing those rows.')
                outcome = outcome[~np.isnan(outcome).any(axis=1)]

            outcome = np.squeeze(np.mean(outcome, axis=0))

            # NOTE: Binarize specifically for multiclass
            for i in range(0, outcome.shape[0]):
                label = np.argmax(outcome[i, :])
                outcome[i, :] = np.zeros(outcome.shape[1])
                outcome[i, label] = 1

        else:
            # Singlelabel
            outcome = np.zeros((self.n_estimators, len(X)))
            for num, est in enumerate(self.estimators):
                if hasattr(est, 'predict_proba'):
                    # BUG: SVM kernel can be wrong type
                    if hasattr(est.best_estimator_, 'kernel'):
                        est.best_estimator_.kernel = str(est.best_estimator_.kernel)
                    outcome[num, :] = est.predict_proba(X)[:, 1]
                else:
                    outcome[num, :] = est.predict(X)

            # Replace NaN if they are there
            outcome = outcome[~np.isnan(outcome).any(axis=1)]
            outcome = np.squeeze(np.mean(outcome, axis=0))

            # Binarize
            isclassifier = is_classifier(est.best_estimator_)
            if isclassifier:
                outcome[outcome >= 0.5] = 1
                outcome[outcome < 0.5] = 0

        return outcome

    def predict_proba(self, X):
        """Call predict_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator
        supports ``predict_proba``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self.estimators[0]._check_is_fitted('predict_proba')

        # Check if we are dealing with multilabel
        if len(self.estimators[0].predict(X).shape) == 1:
            nlabels = 1
        else:
            nlabels = self.estimators[0].predict(X).shape[1]

        if type(self.estimators[0].best_estimator_) == OneVsRestClassifier:
            multilabel = True
        elif nlabels > 1:
            multilabel = True
        else:
            multilabel = False

        if multilabel:
            # Multilabel
            outcome = np.zeros((self.n_estimators, len(X), nlabels))
            for num, est in enumerate(self.estimators):
                if hasattr(est, 'predict_proba'):
                    # BUG: SVM kernel can be wrong type
                    if hasattr(est.best_estimator_, 'kernel'):
                        est.best_estimator_.kernel = str(est.best_estimator_.kernel)
                    outcome[num, :, :] = est.predict_proba(X)
                else:
                    outcome[num, :, :] = est.predict(X)

            # Replace NaN if they are there
            if np.isnan(outcome).any():
                print('[WARNING] Predictions contain NaN, removing those rows.')
                outcome = outcome[~np.isnan(outcome).any(axis=1)]

            outcome = np.squeeze(np.mean(outcome, axis=0))

        else:
            # Single label
            # For probabilities, we get both a class0 and a class1 score
            outcome = np.zeros((len(X), 2))
            outcome_class1 = np.zeros((self.n_estimators, len(X)))
            outcome_class2 = np.zeros((self.n_estimators, len(X)))
            for num, est in enumerate(self.estimators):
                # BUG: SVM kernel can be wrong type
                if hasattr(est.best_estimator_, 'kernel'):
                    est.best_estimator_.kernel = str(est.best_estimator_.kernel)
                outcome_class1[num, :] = est.predict_proba(X)[:, 0]
                outcome_class2[num, :] = est.predict_proba(X)[:, 1]

            outcome[:, 0] = np.squeeze(np.mean(outcome_class1, axis=0))
            outcome[:, 1] = np.squeeze(np.mean(outcome_class2, axis=0))

        return outcome

    def predict_log_proba(self, X):
        """Call predict_log_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator
        supports ``predict_log_proba``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self.estimators[0]._check_is_fitted('predict_log_proba')

        outcome = np.zeros((self.n_estimators, len(X)))
        for num, est in enumerate(self.estimators):
            outcome[num, :] = est.predict_log_proba(X)

        outcome = np.squeeze(np.mean(outcome, axis=0))
        return outcome

    def decision_function(self, X):
        """Call decision_function on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator
        supports ``decision_function``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self.estimators[0]._check_is_fitted('decision_function')

        # NOTE: Check if we are dealing with multilabel
        if type(self.estimators[0].best_estimator_) == OneVsRestClassifier:
            # Multilabel
            nlabels = self.estimators[0].decision_function(X).shape[1]
            outcome = np.zeros((self.n_estimators, len(X), nlabels))
            for num, est in enumerate(self.estimators):
                outcome[num, :, :] = est.decision_function(X)

            outcome = np.squeeze(np.mean(outcome, axis=0))
        else:
            # Singlelabel
            outcome = np.zeros((self.n_estimators, len(X)))
            for num, est in enumerate(self.estimators):
                outcome[num, :] = est.decision_function(X)

            outcome = np.squeeze(np.mean(outcome, axis=0))

        return outcome

    def transform(self, X):
        """Call transform on the estimator with the best found parameters.

        Only available if the underlying estimator supports ``transform`` and
        ``refit=True``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self.estimators[0]._check_is_fitted('transform')

        outcome = np.zeros((self.n_estimators, len(X)))
        for num, est in enumerate(self.estimators):
            outcome[num, :] = est.transform(X)

        outcome = np.squeeze(np.mean(outcome, axis=0))
        return outcome

    def inverse_transform(self, Xt):
        """Call inverse_transform on the estimator with the best found params.

        Only available if the underlying estimator implements
        ``inverse_transform`` and ``refit=True``.

        Parameters
        ----------
        Xt : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self.estimators[0]._check_is_fitted('inverse_transform')

        outcome = np.zeros((self.n_estimators, len(Xt)))
        for num, est in enumerate(self.estimators):
            outcome[num, :] = est.transform(Xt)

        outcome = np.squeeze(np.mean(outcome, axis=0))
        return outcome

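# Minimal usage sketch of the Ensemble wrapper above (hypothetical names:
# `search_1` and `search_2` stand for two SearchCV objects fitted elsewhere
# with refit=True, and `X_test` for an array of feature values):
#
#     >>> ens = Ensemble([search_1, search_2])
#     >>> probabilities = ens.predict_proba(X_test)  # mean over the members
#     >>> labels = ens.predict(X_test)  # thresholded at 0.5 for classifiers
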
class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator,
                                      MetaEstimatorMixin)):
    """Base class for hyper parameter search with cross-validation."""

    @abstractmethod
    def __init__(self, param_distributions={}, n_iter=10, scoring=None,
                 fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
                 verbose=0, pre_dispatch='2*n_jobs', random_state=None,
                 error_score='raise', return_train_score=True,
                 n_jobspercore=100, maxlen=100, fastr_plugin=None,
                 memory='2G', ranking_score='test_score',
                 refit_training_workflows=False,
                 ensemble_validation_score=None,
                 refit_validation_workflows=False):
        """Initialize SearchCV Object."""
        # Added for fastr and joblib executions
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.n_jobspercore = n_jobspercore
        self.random_state = random_state
        self.ensemble = list()
        self.fastr_plugin = fastr_plugin
        self.memory = memory
        self.ensemble_validation_score = ensemble_validation_score

        # Below are the defaults from sklearn
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.fit_params = fit_params if fit_params is not None else {}
        self.iid = iid
        self.refit = refit
        self.cv = cv
        self.verbose = verbose
        self.pre_dispatch = pre_dispatch

        # Manually added steps
        self.error_score = error_score
        self.return_train_score = return_train_score
        self.maxlen = maxlen
        self.ranking_score = ranking_score
        self.refit_training_workflows = refit_training_workflows
        self.refit_validation_workflows = refit_validation_workflows
        self.fitted_workflows = list()
        self.fitted_validation_workflows = list()

        # Only for WORC Paper
        self.test_RS = True

    @property
    def _estimator_type(self):
        return self.estimator._estimator_type

    def score(self, X, y=None):
        """Compute the score (i.e. probability) on the given data.

        This uses the score defined by ``scoring`` where provided, and the
        ``best_estimator_.score`` method otherwise.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Input data, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        Returns
        -------
        score : float

        """
        if self.scorer_ is None:
            raise ValueError("No score function explicitly defined, "
                             "and the estimator doesn't provide one %s"
                             % self.best_estimator_)

        X, y = self.preprocess(X, y)

        return self.scorer_(self.best_estimator_, X, y)

    def _check_is_fitted(self, method_name):
        if not self.refit:
            raise NotFittedError(('This SearchCV instance was initialized '
                                  'with refit=False. %s is '
                                  'available only after refitting on the best '
                                  'parameters. ') % method_name)
        else:
            check_is_fitted(self, 'best_estimator_')

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def predict(self, X):
        """Call predict on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator
        supports ``predict``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('predict')
        if self.ensemble:
            return self.ensemble.predict(X)
        else:
            X, _ = self.preprocess(X)
            return self.best_estimator_.predict(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def predict_proba(self, X):
        """Call predict_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator
        supports ``predict_proba``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('predict_proba')

        # BUG: SVM kernel is sometimes saved as unicode instead of str
        if hasattr(self.best_estimator_, 'kernel'):
            self.best_estimator_.kernel = str(self.best_estimator_.kernel)

        if self.ensemble:
            return self.ensemble.predict_proba(X)
        else:
            X, _ = self.preprocess(X)
            return self.best_estimator_.predict_proba(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def predict_log_proba(self, X):
        """Call predict_log_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator
        supports ``predict_log_proba``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('predict_log_proba')

        # BUG: SVM kernel is sometimes saved as unicode instead of str
        if hasattr(self.best_estimator_, 'kernel'):
            self.best_estimator_.kernel = str(self.best_estimator_.kernel)

        if self.ensemble:
            return self.ensemble.predict_log_proba(X)
        else:
            X, _ = self.preprocess(X)
            return self.best_estimator_.predict_log_proba(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def decision_function(self, X):
        """Call decision_function on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator
        supports ``decision_function``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('decision_function')
        if self.ensemble:
            return self.ensemble.decision_function(X)
        else:
            X, _ = self.preprocess(X)
            return self.best_estimator_.decision_function(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def transform(self, X):
        """Call transform on the estimator with the best found parameters.

        Only available if the underlying estimator supports ``transform`` and
        ``refit=True``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('transform')
        if self.ensemble:
            return self.ensemble.transform(X)
        else:
            # preprocess returns an (X, y) tuple; only the features are needed
            X, _ = self.preprocess(X)
            return self.best_estimator_.transform(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def inverse_transform(self, Xt):
        """Call inverse_transform on the estimator with the best found params.

        Only available if the underlying estimator implements
        ``inverse_transform`` and ``refit=True``.

        Parameters
        ----------
        Xt : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('inverse_transform')
        if self.ensemble:
            return self.ensemble.transform(Xt)
        else:
            Xt, _ = self.preprocess(Xt)
            return self.best_estimator_.transform(Xt)

    def preprocess(self, X, y=None, training=False):
        """Apply the available preprocessing methods to the features."""
        if self.best_preprocessor is not None:
            X = self.best_preprocessor.transform(X)

        if self.best_encoder is not None:
            X = self.best_encoder.transform(X)

        if self.best_imputer is not None:
            X = self.best_imputer.transform(X)

        # Replace NaN if still left
        X = replacenan(np.asarray(X)).tolist()

        if self.best_groupsel is not None:
            X = self.best_groupsel.transform(X)

        if self.best_varsel is not None:
            X = self.best_varsel.transform(X)

        if not training and hasattr(self, 'overfit_scaler') and self.overfit_scaler:
            # Overfit the feature scaling on the test set
            # NOTE: Never use this in an actual model, only to assess how
            # different your features are in your train and test sets
            m = '[WORC WARNING] You chose to overfit the feature scaling. ' +\
                'Never use this in an actual model, only to assess how ' +\
                'different your features are in your train and test sets.'
            print(m)
            scaler = StandardScaler().fit(X)
            if scaler is not None:
                X = scaler.transform(X)
        else:
            if self.best_scaler is not None:
                X = self.best_scaler.transform(X)

        if self.best_reliefsel is not None:
            X = self.best_reliefsel.transform(X)

        if self.best_modelsel is not None:
            X = self.best_modelsel.transform(X)

        if self.best_statisticalsel is not None:
            X = self.best_statisticalsel.transform(X)

        if self.best_rfesel is not None:
            X = self.best_rfesel.transform(X)

        if self.best_pca is not None:
            X = self.best_pca.transform(X)

        # Only resampling in training phase, i.e. if we have the labels
        if y is not None:
            if self.best_Sampler is not None:
                X, y = self.best_Sampler.transform(X, y)

        return X, y

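    # Note on preprocess(): the fitted components are applied in a fixed order
    # before any predict/transform call on best_estimator_: feature
    # preprocessor -> encoder -> imputer -> NaN replacement -> group selection
    # -> variance selection -> scaling -> ReliefF -> SelectModel ->
    # statistical selection -> RFE -> PCA -> (only when labels are given)
    # resampling.
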
[docs] def process_fit(self, n_splits, parameters_all, test_sample_counts, test_score_dicts, train_score_dicts, fit_time, score_time, cv_iter, X, y, fitted_workflows=list(), fitted_validation_workflows=list(), use_smac=False): """Process a fit. Process the outcomes of a SearchCV fit and find the best settings over all cross validations from all hyperparameters tested Very similar to the _format_results function or the original SearchCV. """ # test_score_dicts and train_score dicts are lists of dictionaries and # we make them into dict of lists if self.verbose: print('Processing fits.') test_scores = _aggregate_score_dicts(test_score_dicts) if self.return_train_score: train_scores = _aggregate_score_dicts(train_score_dicts) # We take only one result per split, default by sklearn pipelines_per_split = int(len(parameters_all) / n_splits) # Change the list of parameters based on the shape of the input if use_smac: candidate_params_all = list(parameters_all[::n_splits]) else: candidate_params_all = list(parameters_all[:pipelines_per_split]) n_candidates = len(candidate_params_all) # Store some of the resulting scores results = dict() # Computed the (weighted) mean and std for test scores alone def _store(key_name, array, weights=None, splits=False, rank=False): """A small helper to store the scores/times to the cv_results_""" # Change processing based on the shape of the input if use_smac: array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits) else: array = np.transpose(np.array(array, dtype=np.float64).reshape(n_splits, n_candidates)) if splits: for split_i in range(n_splits): results["split%d_%s" % (split_i, key_name)] = array[:, split_i] try: array_means = np.average(array, axis=1, weights=weights) except ZeroDivisionError as e: e = f'[WORC Warning] {e}. Setting {key_name} to unweighted.' print(e) array_means = np.average(array, axis=1) results['mean_%s' % key_name] = array_means array_mins = np.min(array, axis=1) results['min_%s' % key_name] = array_mins # Weighted std is not directly available in numpy try: array_stds = np.sqrt(np.average((array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights)) except ZeroDivisionError as e: e = f'[WORC Warning] {e}. Setting {key_name} to unweighted.' 
print(e) array_stds = np.sqrt(np.average((array - array_means[:, np.newaxis]) ** 2, axis=1)) results['std_%s' % key_name] = array_stds if rank: results["rank_%s" % key_name] = np.asarray( rankdata(-array_means, method='min'), dtype=np.int32) _store('fit_time', fit_time) _store('score_time', score_time) # Store scores # Check whether to do multimetric scoring test_estimator = cc.construct_classifier(candidate_params_all[0]) scorers, self.multimetric_ = check_multimetric_scoring( test_estimator, scoring=self.scoring) # NOTE test_sample counts (weights) remain the same for all candidates test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=np.int) if self.iid != 'deprecated': warnings.warn( "The parameter 'iid' is deprecated in 0.22 and will be " "removed in 0.24.", FutureWarning ) iid = self.iid else: iid = False icheck = 0 for scorer_name in scorers.keys(): # Computed the (weighted) mean and std for test scores alone key_name = 'test_%s' % scorer_name _store('test_%s' % scorer_name, test_scores[scorer_name], splits=True, rank=True, weights=test_sample_counts if iid else None) if DebugDetector().do_detection() and icheck == 0: # Check the scores for some splits for i in range(10): print('Iteration: ' + str(i)) print(test_scores[scorer_name][i]) print(results["split%d_%s" % (0, key_name)][i]) print(test_scores[scorer_name][i + 10]) print(results["split%d_%s" % (1, key_name)][i]) print(results['mean_%s' % key_name][i]) print('\n') icheck += 1 if self.return_train_score: _store('train_%s' % scorer_name, train_scores[scorer_name], splits=True) # Compute the "Generalization" score difference_score = abs(results['mean_train_score'] - results['mean_test_score']) generalization_score = results['mean_test_score'] - difference_score results['generalization_score'] = generalization_score results['rank_generalization_score'] = np.asarray( rankdata(-results['generalization_score'], method='min'), dtype=np.int32) if self.multimetric_: if self.refit is not False and ( not isinstance(self.refit, str) or # This will work for both dict / list (tuple) self.refit not in scorers) and not callable(self.refit): raise ValueError("For multi-metric scoring, the parameter " "refit must be set to a scorer key or a " "callable to refit an estimator with the " "best parameter setting on the whole " "data and make the best_* attributes " "available for that metric. If this is " "not needed, refit should be set to " "False explicitly. %r was passed." % self.refit) else: refit_metric = self.refit else: refit_metric = 'score' # For multi-metric evaluation, store the best_index_, best_params_ and # best_score_ iff refit is one of the scorer names # In single metric evaluation, refit_metric is "score" if self.refit or not self.multimetric_: # If callable, refit is expected to return the index of the best # parameter set. 
if callable(self.refit): self.best_index_ = self.refit(results) if not isinstance(self.best_index_, numbers.Integral): raise TypeError('best_index_ returned is not an integer') if (self.best_index_ < 0 or self.best_index_ >= len(results["params"])): raise IndexError('best_index_ index out of range') else: self.best_index_ = results["rank_test_%s" % refit_metric].argmin() self.best_score_ = results["mean_test_%s" % refit_metric][ self.best_index_] self.best_params_ = candidate_params_all[self.best_index_] # Rank the indices of scores from all parameter settings ranked_test_scores = results["rank_" + self.ranking_score] indices = range(0, len(ranked_test_scores)) sortedindices = [x for _, x in sorted(zip(ranked_test_scores, indices))] # In order to reduce the memory used, we will only save # a maximum of results maxlen = min(self.maxlen, n_candidates) bestindices = sortedindices[0:maxlen] candidate_params_all = np.asarray(candidate_params_all)[bestindices].tolist() for k in results.keys(): results[k] = results[k][bestindices] results['params'] = candidate_params_all # Calculate and store the total_fit_time of this train/test CV results['total_fit_time'] = np.sum(fit_time) # Store the atributes of the best performing estimator best_index = np.flatnonzero(results["rank_" + self.ranking_score] == 1)[0] best_parameters_all = candidate_params_all[best_index] # Store several objects self.cv_results_ = results self.n_splits_ = n_splits self.cv_iter = cv_iter self.best_index_ = best_index self.best_params_ = results["params"][self.best_index_] if self.refit: # We always refit on the full dataset indices = np.arange(0, len(y)) self.refit_and_score(X, y, best_parameters_all, train=indices, test=indices) # Store the only scorer not as a dict for single metric evaluation self.scorer_ = scorers if self.multimetric_ else scorers['score'] # Refit the top performing workflows on the full training dataset if self.refit_training_workflows and fitted_workflows: # Select only from one train-val split, as they are identical fitted_workflows = fitted_workflows[:pipelines_per_split] # Sort according to best indices fitted_workflows = [fitted_workflows[i] for i in bestindices] self.fitted_workflows = fitted_workflows if self.refit_validation_workflows and fitted_validation_workflows: # Select from all train-val splits the best indices bestindices_all = list() for j in range(len(cv_iter)): bestindices_all.extend([i + n_candidates * j for i in bestindices]) fitted_validation_workflows =\ [fitted_validation_workflows[i] for i in bestindices_all] self.fitted_validation_workflows = fitted_validation_workflows return self
    def refit_and_score(self, X, y, parameters_all, train, test,
                        verbose=None):
        """Refit the base estimator and attributes such as GroupSel.

        Parameters
        ----------
        X: array, mandatory
            Array containing for each object (rows) the feature values
            (1st Column) and the associated feature label (2nd Column).

        y: list(?), mandatory
            List containing the labels of the objects.

        parameters_all: dictionary, mandatory
            Contains the settings used for all the preprocessing functions
            and the fitting. TODO: Create a default object and show the
            fields.

        train: list, mandatory
            Indices of the objects to be used as training set.

        test: list, mandatory
            Indices of the objects to be used as testing set.

        """
        if verbose is None:
            verbose = self.verbose

        # Preprocess features if required
        if 'FeatPreProcess' in parameters_all:
            if parameters_all['FeatPreProcess'] == 'True':
                print("Preprocessing features.")
                feature_values = np.asarray([x[0] for x in X])
                feature_labels = np.asarray([x[1] for x in X])
                preprocessor = Preprocessor(verbose=False)
                preprocessor.fit(feature_values,
                                 feature_labels=feature_labels[0, :])
                feature_values = preprocessor.transform(feature_values)
                feature_labels = preprocessor.transform(feature_labels)
                X_fit = [(values, labels) for values, labels in
                         zip(feature_values, feature_labels)]
            else:
                X_fit = X
                preprocessor = None
        else:
            X_fit = X
            preprocessor = None

        # Refit all preprocessing functions
        fit_params = _check_fit_params(X_fit, self.fit_params)
        out = fit_and_score(X_fit, y, self.scoring,
                            train, test, parameters_all,
                            fit_params=fit_params,
                            return_train_score=self.return_train_score,
                            return_n_test_samples=True,
                            return_times=True, return_parameters=False,
                            return_estimator=True,
                            error_score=self.error_score,
                            verbose=verbose,
                            return_all=True, skip=True)

        # Associate best options with new fits
        (save_data, GroupSel, VarSel, SelectModel, feature_labels, scalers,
         encoders, Imputers, PCAs, StatisticalSel, RFESel, ReliefSel,
         Sampler) = out
        fitted_estimator = save_data[-2]

        self.best_groupsel = GroupSel
        self.best_scaler = scalers
        self.best_varsel = VarSel
        self.best_modelsel = SelectModel
        self.best_preprocessor = preprocessor
        self.best_imputer = Imputers
        self.best_encoder = encoders
        self.best_pca = PCAs
        self.best_featlab = feature_labels
        self.best_statisticalsel = StatisticalSel
        self.best_rfesel = RFESel
        self.best_reliefsel = ReliefSel
        self.best_Sampler = Sampler
        self.best_estimator_ = fitted_estimator
        self.best_params_ = parameters_all

        return self

[docs] def create_ensemble(self, X_train, Y_train, verbose=None, initialize=False, scoring=None, method='top_N', size=50, overfit_scaler=False): """ Create an (optimal) ensemble of a combination of hyperparameter settings and the associated groupsels, PCAs, estimators etc. # The following ensemble methods are supported: # Single: # only use the single best classifier. Performance is computed # using the same predict function as during the optimization # top_N: # make an ensemble of the best N individual classifiers, where N is # given as an input. If N==1, then only the single best classifier is # used, but it is evaluated using predict_proba. # FitNumber: # make an ensemble of the best N individual classifiers, choosing N # that gives the highest performance # ForwardSelection: # add the model that optimizes the total ensemble performance, # then repeat with replacement until there is no more improvement # in performance # Caruana: # for a fixed number of iterations, add the model that optimizes # the total ensemble performance, then choose the ensemble size # which gave the best performance # Bagging: # same as Caruana method, but the final ensemble is a weighted average # of a number of ensembles that each use only a subset of the available # models """ # Define a function for scoring the performance of a classifier def compute_performance(scoring, Y_valid_truth, Y_valid_score): if scoring == 'f1_weighted' or scoring == 'f1': # Convert score to binaries first for num in range(0, len(Y_valid_score)): if Y_valid_score[num] >= 0.5: Y_valid_score[num] = 1 else: Y_valid_score[num] = 0 perf = f1_score(Y_valid_truth, Y_valid_score, average='weighted') elif scoring == 'f1': # Convert score to binaries first for num in range(0, len(Y_valid_score)): if Y_valid_score[num] >= 0.5: Y_valid_score[num] = 1 else: Y_valid_score[num] = 0 perf = f1_score(Y_valid_truth, Y_valid_score, average='macro') elif scoring == 'auc': perf = roc_auc_score(Y_valid_truth, Y_valid_score) elif scoring == 'sar': perf = sar_score(Y_valid_truth, Y_valid_score) else: raise KeyError('[WORC Warning] No valid score method given in ensembling: ' + str(scoring)) return perf if verbose is None: verbose = self.verbose if scoring is None: scoring = self.scoring # Get settings for best estimators parameters_all = self.cv_results_['params'] n_classifiers = len(parameters_all) n_iter = len(self.cv_iter) # Create a new base object for the ensemble components if type(self) == RandomizedSearchCVfastr: base_estimator = RandomizedSearchCVfastr() elif type(self) == RandomizedSearchCVJoblib: base_estimator = RandomizedSearchCVJoblib() elif type(self) == GuidedSearchCVSMAC: base_estimator = GuidedSearchCVSMAC() if method == 'Single': # Do not refit all the classifiers if we only need the best one ensemble = [0] elif method == 'top_N': # Do not refit all the classifiers if we only need the best N ensemble = range(0, size) else: # Refit the models and compute the predictions on the validation sets if verbose: print('\t - Precomputing scores on training and validation set for ensembling.') if self.fitted_validation_workflows: print('\t - Detected already fitted train-val workflows.') # Create the ground truth Y_valid_truth = list() for it, (train, valid) in enumerate(self.cv_iter): Y_valid_truth.append(Y_train[valid]) # Precompute the scores of all estimators performances = list() all_predictions = list() ensemble_configurations = list() prediction_length = len(self.cv_iter[0][1]) for num, p_all in enumerate(parameters_all): performances_iter = 
list() predictions_iter = np.zeros((n_iter, prediction_length)) for it, (train, valid) in enumerate(self.cv_iter): def getpredictions(): new_estimator = clone(base_estimator) # Fit the preprocessors of the pipeline out = fit_and_score(X_train, Y_train, scoring, train, valid, p_all, return_all=True) (save_data, GroupSel, VarSel, SelectModel, feature_labels, scalers, encoders, Imputers, PCAs, StatisticalSel, RFESel, ReliefSel, Sampler) = out new_estimator.best_groupsel = GroupSel new_estimator.best_scaler = scalers new_estimator.best_varsel = VarSel new_estimator.best_modelsel = SelectModel new_estimator.best_preprocessor = None new_estimator.best_imputer = Imputers new_estimator.best_encoder = encoders new_estimator.best_pca = PCAs new_estimator.best_featlab = feature_labels new_estimator.best_statisticalsel = StatisticalSel new_estimator.best_rfesel = RFESel new_estimator.best_reliefsel = ReliefSel new_estimator.best_Sampler = Sampler # Use the fitted preprocessors to preprocess the features X_train_values = np.asarray([x[0] for x in X_train]) processed_X, processed_y = new_estimator.preprocess(X_train_values[train], Y_train[train], training=True) # Check if there are features left (patients, features_left) = np.shape(processed_X) if features_left == 0: print('no features left' + '\n') # No features are left; do not consider this pipeline for the ensemble return None # Construct and fit the classifier best_estimator = cc.construct_classifier(p_all) best_estimator.fit(processed_X, processed_y) new_estimator.best_estimator_ = best_estimator predictions = new_estimator.predict_proba(X_train_values[valid]) return predictions predictions = list() # Start with storing the ground truth if self.fitted_validation_workflows: # Use already fitted workflow estimator = self.fitted_validation_workflows[num + it * self.maxlen] if estimator is None: # Estimator is none, refit and get predictions predictions = getpredictions() else: X_train_values = np.asarray([x[0] for x in X_train]) try: predictions = estimator.predict_proba(X_train_values[valid]) except (NotFittedError, ValueError, AttributeError): # Estimator cannot be fitted properly, hence skip it predictions = None else: predictions = getpredictions() if predictions is None: # Estimator cannot be fitted properly, hence skip it break # Only take the probabilities for the second class predictions = predictions[:, 1] # Store the predictions on this split predictions_iter[it, :] = predictions # Compute and store the performance on this split performances_iter.append(compute_performance(scoring, Y_train[valid], predictions)) # print('fitandscore: ' + str(out[0][1]) + ' and computed: ' + # str(compute_performance(scoring, Y_train[valid], predictions)) + '\n') # At the end of the last iteration, store the results of this pipeline if it == (n_iter - 1): # Add the pipeline to the list ensemble_configurations.append(p_all) # Store the predictions all_predictions.append(predictions_iter) # Store the performance performances.append(np.mean(performances_iter)) # Update the parameters parameters_all = ensemble_configurations n_classifiers = len(ensemble_configurations) # Construct the array of final predictions base_Y_valid_score = np.zeros((n_iter, n_classifiers, prediction_length)) for iter in range(n_iter): for num in range(n_classifiers): base_Y_valid_score[iter][num] = all_predictions[num][iter] # Create the ensemble using the precomputed scores: # Initialize the ensemble ensemble = list() # Initialize the stacked list of predictions that we keep for the 
ensemble y_score = [None] * n_iter best_performance = 0 new_performance = 0.001 single_estimator_performance = max(performances) iteration = 0 if method == 'FitNumber': sortedindices = np.argsort(performances)[::-1] performances_n_class = list() if verbose: print("\n") print('Creating ensemble with FitNumber method.') for iteration in range(0, n_classifiers): Y_valid_score = copy.deepcopy(base_Y_valid_score) if iteration > 1: for num in range(0, n_iter): y_score[num] = np.vstack((y_score[num], Y_valid_score[num][ensemble[-1], :])) elif iteration == 1: # Create y_score object for second iteration for num in range(0, n_iter): y_score[num] = Y_valid_score[num][ensemble[-1], :] # Perform n-fold cross validation to estimate performance of next best classifier performances_temp = np.zeros((n_iter)) for n_crossval in range(0, n_iter): # For each estimator, add the score to the ensemble and new ensemble performance if iteration == 0: # No y_score yet, so we need to build it instead of stacking y_valid_score_new = Y_valid_score[n_crossval][sortedindices[iteration], :] else: # Stack scores of added model on top of previous scores and average y_valid_score_new = np.mean(np.vstack((y_score[n_crossval], Y_valid_score[n_crossval][sortedindices[iteration], :])), axis=0) perf = compute_performance(scoring, Y_valid_truth[n_crossval], y_valid_score_new) performances_temp[n_crossval] = perf # Check which estimator should be in the ensemble to maximally improve new_performance = np.mean(performances_temp) performances_n_class.append(new_performance) best_index = sortedindices[iteration] ensemble.append(best_index) # Select N_models for initialization new_performance = max(performances_n_class) N_models = performances_n_class.index(new_performance) + 1 # +1 due to python indexing ensemble = ensemble[0:N_models] best_performance = new_performance self.ensemble_validation_score = best_performance if verbose: print(f"Ensembling best {scoring}: {best_performance}.") print(f"Single estimator best {scoring}: {single_estimator_performance}.") print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.') elif method == 'ForwardSelection': if verbose: print('Creating ensemble with ForwardSelection method.') while new_performance > best_performance: Y_valid_score = copy.deepcopy(base_Y_valid_score) if verbose: print(f"Iteration: {iteration}, best {scoring}: {new_performance}.") best_performance = new_performance if iteration > 1: ensemble.append(best_index) for num in range(0, n_iter): y_score[num] = np.vstack((y_score[num], Y_valid_score[num][ensemble[-1], :])) elif iteration == 1: # Create y_score object for second iteration ensemble.append(best_index) for num in range(0, n_iter): y_score[num] = Y_valid_score[num][ensemble[-1], :] # Perform n-fold cross validation to estimate performance of each possible addition to ensemble performances_temp = np.zeros((n_iter, n_classifiers)) for n_crossval in range(0, n_iter): # For each estimator, add the score to the ensemble and new ensemble performance for n_estimator in range(0, n_classifiers): if iteration == 0: # No y_score yet, so we need to build it instead of stacking y_valid_score_new = Y_valid_score[n_crossval][n_estimator, :] else: # Stack scores of added model on top of previous scores and average y_valid_score_new = np.mean(np.vstack((y_score[n_crossval], Y_valid_score[n_crossval][n_estimator, :])), axis=0) perf = compute_performance(scoring, Y_valid_truth[n_crossval], y_valid_score_new) performances_temp[n_crossval, n_estimator] = perf # Average performances 
over crossval performances_temp = list(np.mean(performances_temp, axis=0)) # Check which ensemble should be in the ensemble to maximally improve new_performance = max(performances_temp) best_index = performances_temp.index(new_performance) iteration += 1 self.ensemble_validation_score = best_performance if verbose: # Print the performance gain print(f"Ensembling best {scoring}: {best_performance}.") print(f"Single estimator best {scoring}: {single_estimator_performance}.") print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.') elif method == 'Caruana': if verbose: print('Creating ensemble with Caruana method.') best_ensemble_scores = list() while iteration < 20: Y_valid_score = copy.deepcopy(base_Y_valid_score) if verbose: print(f"Iteration: {iteration}, best {scoring}: {new_performance}.") if iteration > 1: # Stack scores: not needed for first iteration for num in range(0, n_iter): y_score[num] = np.vstack((y_score[num], Y_valid_score[num][ensemble[-1], :])) elif iteration == 1: # Create y_score object for second iteration for num in range(0, n_iter): y_score[num] = Y_valid_score[num][ensemble[-1], :] # Perform n-fold cross validation to estimate performance of each possible addition to ensemble performances_temp = np.zeros((n_iter, n_classifiers)) for n_crossval in range(0, n_iter): # For each estimator, add the score to the ensemble and new ensemble performance for n_estimator in range(0, n_classifiers): if iteration == 0: # No y_score yet, so we need to build it instead of stacking y_valid_score_new = Y_valid_score[n_crossval][n_estimator, :] else: # Stack scores of added model on top of previous scores and average y_valid_score_new = np.mean(np.vstack((y_score[n_crossval], Y_valid_score[n_crossval][n_estimator, :])), axis=0) perf = compute_performance(scoring, Y_valid_truth[n_crossval], y_valid_score_new) performances_temp[n_crossval, n_estimator] = perf # Average performances over crossval performances_temp = list(np.mean(performances_temp, axis=0)) # Check which ensemble should be in the ensemble to maximally improve new_performance = max(performances_temp) best_ensemble_scores.append(new_performance) best_index = performances_temp.index(new_performance) ensemble.append(best_index) iteration += 1 # Select the optimal ensemble size optimal_ensemble_performance = max(best_ensemble_scores) optimal_N_models = best_ensemble_scores.index(optimal_ensemble_performance) + 1 ensemble = ensemble[0:optimal_N_models] best_performance = optimal_ensemble_performance self.ensemble_validation_score = best_performance if verbose: # Print the performance gain print(f"Ensembling best {scoring}: {best_performance}.") print(f"Single estimator best {scoring}: {single_estimator_performance}.") print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.') elif method == 'Bagging': if verbose: print('Creating ensemble using Caruana with Bagging method.') nr_of_bagging_iterations = size for bag in range(nr_of_bagging_iterations): bag_ensemble = list() subset_size = int(np.floor(n_classifiers / 2)) model_subset = random.sample(range(n_classifiers), subset_size) best_ensemble_scores = list() iteration = 0 while iteration < 20: Y_valid_score = copy.deepcopy(base_Y_valid_score) # if verbose: # print(f"Iteration: {iteration}, best {scoring}: {new_performance}.") if iteration > 1: for num in range(0, n_iter): y_score[num] = np.vstack((y_score[num], Y_valid_score[num][bag_ensemble[-1], :])) elif iteration == 1: # Create y_score object for second iteration for num in range(0, n_iter): 
y_score[num] = Y_valid_score[num][bag_ensemble[-1], :] # Perform n-fold cross validation to estimate performance of each possible addition to ensemble performances_temp = np.zeros((n_iter, subset_size)) for n_crossval in range(0, n_iter): # For each estimator, add the score to the ensemble and new ensemble performance estimator_counter = 0 for n_estimator in model_subset: if iteration == 0: # No y_score yet, so we need to build it instead of stacking y_valid_score_new = Y_valid_score[n_crossval][n_estimator, :] else: # Stack scores of added model on top of previous scores and average y_valid_score_new = np.mean(np.vstack((y_score[n_crossval], Y_valid_score[n_crossval][n_estimator, :])), axis=0) perf = compute_performance(scoring, Y_valid_truth[n_crossval], y_valid_score_new) performances_temp[n_crossval, estimator_counter] = perf estimator_counter += 1 # Average performances over crossval performances_temp = list(np.mean(performances_temp, axis=0)) # Check which ensemble should be in the ensemble to maximally improve new_performance = max(performances_temp) best_ensemble_scores.append(new_performance) best_index = performances_temp.index(new_performance) bag_ensemble.append(best_index) iteration += 1 # Select the optimal ensemble size optimal_ensemble_performance = max(best_ensemble_scores) optimal_N_models = best_ensemble_scores.index(optimal_ensemble_performance) + 1 # Add the best ensemble of this bagging iteration to the final ensemble bag_ensemble = bag_ensemble[0:optimal_N_models] for model in bag_ensemble: ensemble.append(model) best_performance = optimal_ensemble_performance self.ensemble_validation_score = best_performance if verbose: # Print the performance gain print(f"Ensembling best {scoring}: {best_performance}.") print(f"Single estimator best {scoring}: {single_estimator_performance}.") print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.') else: print(f'[WORC WARNING] No valid ensemble method given: {method}. 
Not ensembling') return self # Create the ensemble -------------------------------------------------- # First create and score the ensemble on the validation set # If we only want the best solution, we use the score from cv_results_ # For not Single or Top_N, the score has already been computed during fitting if method == 'Single': self.ensemble_validation_score = self.cv_results_['mean_test_score'][0] elif method == 'top_N': self.ensemble_validation_score = np.mean([self.cv_results_['mean_test_score'][i] for i in ensemble]) if verbose: print('Final ensemble validation score: ' + str(self.ensemble_validation_score)) # Create the ensemble -------------------------------------------------- train = np.arange(0, len(X_train)) if self.fitted_workflows: # Simply select the required estimators print('\t - Detected already fitted train-test workflows.') estimators = list() for enum in ensemble: try: # Try a prediction to see if estimator is truly fitted self.fitted_workflows[enum].predict(np.asarray([X_train[0][0], X_train[1][0]])) estimators.append(self.fitted_workflows[enum]) except (NotFittedError, ValueError, AttributeError): print(f'\t\t - Estimator {enum} not fitted (correctly) yet, refit.') if self.fitted_workflows[enum] is not None: estimator = self.fitted_workflows[enum] else: estimator = clone(base_estimator) estimator.refit_and_score(X_train, Y_train, parameters_all[enum], train, train) try: # Try a prediction to see if estimator is truly fitted estimator.predict(np.asarray([X_train[0][0], X_train[1][0]])) estimators.append(estimator) except (NotFittedError, ValueError): print(f'\t\t - Estimator {enum} could not be fitted (correctly), do not include in ensemble.') else: # Create the ensemble trained on the full training set parameters_all = [parameters_all[i] for i in ensemble] estimators = list() nest = len(ensemble) for enum, p_all in enumerate(parameters_all): # Refit a SearchCV object with the provided parameters if verbose: print(f"Refitting estimator {enum + 1} / {nest}.") base_estimator = clone(base_estimator) # Check if we need to create a multiclass estimator base_estimator.refit_and_score(X_train, Y_train, p_all, train, train, verbose=False) # Determine whether to overfit the feature scaling on the test set base_estimator.overfit_scaler = overfit_scaler try: # Try a prediction to see if estimator is truly fitted base_estimator.predict(np.asarray([X_train[0][0], X_train[1][0]])) estimators.append(base_estimator) except (NotFittedError, ValueError): print(f'\t\t - Estimator {enum} could not be fitted (correctly), do not include in ensemble.') if not estimators: print(f'\t\t - Ensemble is empty, thus go on untill we find an estimator that works and that is the final ensemble.') while not estimators: # We cannot have an empy ensemble, thus go on untill we find an estimator that works enum += 1 p_all = self.cv_results_['params'][enum] # Refit a SearchCV object with the provided parameters base_estimator = clone(base_estimator) # Check if we need to create a multiclass estimator base_estimator.refit_and_score(X_train, Y_train, p_all, train, train, verbose=False) # Determine whether to overfit the feature scaling on the test set base_estimator.overfit_scaler = overfit_scaler try: # Try a prediction to see if estimator is truly fitted base_estimator.predict(np.asarray([X_train[0][0], X_train[1][0]])) estimators.append(base_estimator) except (NotFittedError, ValueError): pass print(f'\t\t - Needed estimator {enum}.') self.ensemble = Ensemble(estimators) self.best_estimator_ = 
self.ensemble print("\n")
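# Minimal usage sketch of the ensembling step defined in create_ensemble above
# (hypothetical names: `search` stands for a fitted RandomizedSearchCVfastr,
# X_train and Y_train for the training features and labels in the format used
# throughout this module, and X_test for an array of feature values):
#
#     >>> search.create_ensemble(X_train, Y_train, method='top_N', size=50)
#     >>> y_prob = search.predict_proba(X_test)  # now averaged over the ensemble
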
[docs]class BaseSearchCVfastr(BaseSearchCV): """Base class for hyper parameter search with cross-validation.""" def _fit(self, X, y, groups, parameter_iterable): """Actual fitting, performing the search over parameters.""" regressors = ['SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet'] isclassifier =\ not any(clf in regressors for clf in self.param_distributions['classifiers']) # Check the cross-validation object and do the splitting cv = check_cv(self.cv, y, classifier=isclassifier) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print(f"Fitting {n_splits} folds for each of {n_candidates} candidates, totalling {n_candidates * n_splits} fits.") cv_iter = list(cv.split(X, y, groups)) # NOTE: We do not check the scoring here, as this can differ # per estimator. Thus, this is done inside the fit and scoring # Check fitting parameters fit_params = _check_fit_params(X, self.fit_params) # Create temporary directory for fastr if DebugDetector().do_detection(): # Specific name for easy debugging debugnum = 0 name = 'DEBUG_' + str(debugnum) tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name) while os.path.exists(tempfolder): debugnum += 1 name = 'DEBUG_' + str(debugnum) tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name) else: name = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name) if not os.path.exists(tempfolder): os.makedirs(tempfolder) # Draw parameter sample for num, parameters in enumerate(parameter_iterable): parameter_sample = parameters break # Preprocess features if required if 'FeatPreProcess' in parameter_sample: if parameter_sample['FeatPreProcess'] == 'True': print("Preprocessing features.") feature_values = np.asarray([x[0] for x in X]) feature_labels = np.asarray([x[1] for x in X]) preprocessor = Preprocessor(verbose=False) preprocessor.fit(feature_values, feature_labels=feature_labels[0, :]) feature_values = preprocessor.transform(feature_values) feature_labels = preprocessor.transform(feature_labels) X = [(values, labels) for values, labels in zip(feature_values, feature_labels)] # Create the parameter files parameters_temp = dict() try: for num, parameters in enumerate(parameter_iterable): parameters["Number"] = str(num) parameters_temp[str(num)] = parameters except ValueError: # One of the parameters gives an error. Find out which one. param_grid = dict() for k, v in parameter_iterable.param_distributions.iteritems(): param_grid[k] = v sampled_params = ParameterSampler(param_grid, 5) try: for num, parameters in enumerate(sampled_params): # Dummy operation a = 1 except ValueError: break message = 'One or more of the values in your parameter sampler ' +\ 'is either not iterable, or the distribution cannot ' +\ 'generate valid samples. Please check your ' +\ f' parameters. At least {k} gives an error.' 
raise WORCexceptions.WORCValueError(message) # Split the parameters files in equal parts keys = list(parameters_temp.keys()) keys = chunks(keys, self.n_jobspercore) parameter_files = dict() for num, k in enumerate(keys): temp_dict = dict() for number in k: temp_dict[number] = parameters_temp[number] fname = f'settings_{num}.json' sourcename = os.path.join(tempfolder, 'parameters', fname) if not os.path.exists(os.path.dirname(sourcename)): os.makedirs(os.path.dirname(sourcename)) with open(sourcename, 'w') as fp: json.dump(temp_dict, fp, indent=4) parameter_files[str(num).zfill(4)] =\ f'vfs://tmp/GS/{name}/parameters/{fname}' # Create test-train splits traintest_files = dict() # TODO: ugly nummering solution num = 0 for train, test in cv_iter: source_labels = ['train', 'test'] source_data = pd.Series([train, test], index=source_labels, name='Train-test data') fname = f'traintest_{num}.hdf5' sourcename = os.path.join(tempfolder, 'traintest', fname) if not os.path.exists(os.path.dirname(sourcename)): os.makedirs(os.path.dirname(sourcename)) traintest_files[str(num).zfill(4)] = f'vfs://tmp/GS/{name}/traintest/{fname}' sourcelabel = f"Source Data Iteration {num}" source_data.to_hdf(sourcename, sourcelabel) num += 1 # Create the files containing the estimator and settings estimator_labels = ['X', 'y', 'scoring', 'verbose', 'fit_params', 'return_train_score', 'return_n_test_samples', 'return_times', 'return_parameters', 'return_estimator', 'error_score', 'return_all', 'refit_training_workflows', 'refit_validation_workflows'] verbose = False return_n_test_samples = True return_times = True return_parameters = False return_estimator = False return_all = False estimator_data = pd.Series([X, y, self.scoring, verbose, fit_params, self.return_train_score, return_n_test_samples, return_times, return_parameters, return_estimator, self.error_score, return_all, self.refit_training_workflows, self.refit_validation_workflows], index=estimator_labels, name='estimator Data') fname = 'estimatordata.hdf5' estimatorname = os.path.join(tempfolder, fname) estimator_data.to_hdf(estimatorname, 'Estimator Data') estimatordata = f"vfs://tmp/GS/{name}/{fname}" # Create the fastr network network = fastr.create_network('WORC_CASH_' + name) estimator_data = network.create_source('HDF5', id='estimator_source', resources=ResourceLimit(memory='4G')) traintest_data = network.create_source('HDF5', id='traintest', resources=ResourceLimit(memory='4G')) parameter_data = network.create_source('JsonFile', id='parameters', resources=ResourceLimit(memory='4G')) sink_output = network.create_sink('HDF5', id='output', resources=ResourceLimit(memory='6G')) fitandscore =\ network.create_node('worc/fitandscore:1.0', tool_version='1.0', id='fitandscore', resources=ResourceLimit(memory=self.memory)) fitandscore.inputs['estimatordata'].input_group = 'estimator' fitandscore.inputs['traintest'].input_group = 'traintest' fitandscore.inputs['parameters'].input_group = 'parameters' fitandscore.inputs['estimatordata'] = estimator_data.output fitandscore.inputs['traintest'] = traintest_data.output fitandscore.inputs['parameters'] = parameter_data.output sink_output.input = fitandscore.outputs['fittedestimator'] source_data = {'estimator_source': estimatordata, 'traintest': traintest_files, 'parameters': parameter_files} sink_data = {'output': f"vfs://tmp/GS/{name}/output_{{sample_id}}_{{cardinality}}{{ext}}"} network.execute(source_data, sink_data, tmpdir=os.path.join(tempfolder, 'tmp'), execution_plugin=self.fastr_plugin) # Check whether all jobs 
have finished expected_no_files = len(list(traintest_files.keys())) * len(list(parameter_files.keys())) sink_files = glob.glob(os.path.join(fastr.config.mounts['tmp'], 'GS', name) + '/output*.hdf5') sink_files.sort() if len(sink_files) != expected_no_files: difference = expected_no_files - len(sink_files) fname = os.path.join(tempfolder, 'tmp') message = ('Fitting classifiers has failed for ' + f'{difference} / {expected_no_files} files. The temporary ' + f'results where not deleted and can be found in {tempfolder}. ' + 'Probably your fitting and scoring failed: check out ' + 'the tmp/fitandscore folder within the tempfolder for ' + 'the fastr job temporary results or run: fastr trace ' + f'"{fname}{os.path.sep}__sink_data__.json" --samples.') raise WORCexceptions.WORCValueError(message) # Read in the output data once finished save_data = list() for output in sink_files: data = pd.read_hdf(output) save_data.extend(list(data['RET'])) # if one choose to see train score, "out" will contain train score info if self.return_train_score: if self.refit_training_workflows: if self.refit_validation_workflows: (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters_all, fitted_workflows, fitted_validation_workflows) =\ zip(*save_data) else: fitted_validation_workflows = None (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters_all, fitted_workflows) =\ zip(*save_data) else: fitted_workflows = None if self.refit_validation_workflows: (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters_all, fitted_validation_workflows) =\ zip(*save_data) else: fitted_validation_workflows = None (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters_all) =\ zip(*save_data) else: if self.refit_training_workflows: if self.refit_validation_workflows: (test_scores, test_sample_counts, fit_time, score_time, parameters_all, fitted_workflows, fitted_validation_workflows) =\ zip(*save_data) else: fitted_validation_workflows = None (test_scores, test_sample_counts, fit_time, score_time, parameters_all, fitted_workflows) =\ zip(*save_data) else: fitted_workflows = None if self.refit_validation_workflows: (test_scores, test_sample_counts, fit_time, score_time, parameters_all, fitted_validation_workflows) =\ zip(*save_data) else: (test_scores, test_sample_counts, fit_time, score_time, parameters_all) =\ zip(*save_data) # Remove the temporary folder used if name != 'DEBUG_0': # Do delete if not debugging for first iteration shutil.rmtree(tempfolder) # Process the results of the fitting procedure self.process_fit(n_splits=n_splits, parameters_all=parameters_all, test_sample_counts=test_sample_counts, test_score_dicts=test_scores, train_score_dicts=train_scores, fit_time=fit_time, score_time=score_time, cv_iter=cv_iter, X=X, y=y, fitted_workflows=fitted_workflows, fitted_validation_workflows=fitted_validation_workflows)
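# Descriptive note on the fastr-based fitting flow in BaseSearchCVfastr._fit
# above: the sampled parameter settings are split with chunks() into JSON files
# of n_jobspercore settings each, the train/validation splits are written to
# HDF5, and a fastr network ('WORC_CASH_<name>') with a single
# 'worc/fitandscore:1.0' node evaluates every (split, chunk) combination. The
# sink HDF5 outputs are then read back, unpacked, and handed to process_fit()
# for ranking.
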
[docs]class RandomizedSearchCVfastr(BaseSearchCVfastr): """Randomized search on hyper parameters. RandomizedSearchCV implements a "fit" and a "score" method. It also implements "predict", "predict_proba", "decision_function", "transform" and "inverse_transform" if they are implemented in the estimator used. The parameters of the estimator used to apply these methods are optimized by cross-validated search over parameter settings. In contrast to GridSearchCV, not all parameter values are tried out, but rather a fixed number of parameter settings is sampled from the specified distributions. The number of parameter settings that are tried is given by n_iter. If all parameters are presented as a list, sampling without replacement is performed. If at least one parameter is given as a distribution, sampling with replacement is used. It is highly recommended to use continuous distributions for continuous parameters. Read more in the sklearn user guide. Parameters ---------- estimator : estimator object. A object of that type is instantiated for each grid point. This is assumed to implement the scikit-learn estimator interface. Either estimator needs to provide a ``score`` function, or ``scoring`` must be passed. param_distributions : dict Dictionary with parameters names (string) as keys and distributions or lists of parameters to try. Distributions must provide a ``rvs`` method for sampling (such as those from scipy.stats.distributions). If a list is given, it is sampled uniformly. n_iter : int, default=10 Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution. scoring : string, callable or None, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. If ``None``, the ``score`` method of the estimator is used. fit_params : dict, optional Parameters to pass to the fit method. n_jobs : int, default=1 Number of jobs to run in parallel. pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use this for lightweight and fast-running jobs, to avoid delays due to on-demand spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' iid : boolean, default=True If True, the data is assumed to be identically distributed across the folds, and the loss minimized is the total loss per sample, and not the mean loss across the folds. cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - An object to be used as a cross-validation generator. - An iterable yielding train, test splits. For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. Refer the sklearn user guide for the various cross-validation strategies that can be used here. refit : boolean, default=True Refit the best estimator with the entire dataset. 
If "False", it is impossible to make predictions using this RandomizedSearchCV instance after fitting. verbose : integer Controls the verbosity: the higher, the more messages. random_state : int or RandomState Pseudo random number generator state used for random uniform sampling from lists of possible values instead of scipy.stats distributions. error_score : 'raise' (default) or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. return_train_score : boolean, default=True If ``'False'``, the ``cv_results_`` attribute will not include training scores. Attributes ---------- cv_results_ : dict of numpy (masked) ndarrays A dict with keys as column headers and values as columns, that can be imported into a pandas ``DataFrame``. For instance the below given table +--------------+-------------+-------------------+---+---------------+ | param_kernel | param_gamma | split0_test_score |...|rank_test_score| +==============+=============+===================+===+===============+ | 'rbf' | 0.1 | 0.8 |...| 2 | +--------------+-------------+-------------------+---+---------------+ | 'rbf' | 0.2 | 0.9 |...| 1 | +--------------+-------------+-------------------+---+---------------+ | 'rbf' | 0.3 | 0.7 |...| 1 | +--------------+-------------+-------------------+---+---------------+ will be represented by a ``cv_results_`` dict of:: { 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'], mask = False), 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False), 'split0_test_score' : [0.8, 0.9, 0.7], 'split1_test_score' : [0.82, 0.5, 0.7], 'mean_test_score' : [0.81, 0.7, 0.7], 'std_test_score' : [0.02, 0.2, 0.], 'rank_test_score' : [3, 1, 1], 'split0_train_score' : [0.8, 0.9, 0.7], 'split1_train_score' : [0.82, 0.5, 0.7], 'mean_train_score' : [0.81, 0.7, 0.7], 'std_train_score' : [0.03, 0.03, 0.04], 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], 'std_fit_time' : [0.01, 0.02, 0.01, 0.01], 'mean_score_time' : [0.007, 0.06, 0.04, 0.04], 'std_score_time' : [0.001, 0.002, 0.003, 0.005], 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...], } NOTE that the key ``'params'`` is used to store a list of parameter settings dict for all the parameter candidates. The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and ``std_score_time`` are all in seconds. best_estimator_ : estimator Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data. Not available if refit=False. best_score_ : float Score of best_estimator on the left out data. best_params_ : dict Parameter setting that gave the best results on the hold out data. best_index_ : int The index (of the ``cv_results_`` arrays) which corresponds to the best candidate parameter setting. The dict at ``search.cv_results_['params'][search.best_index_]`` gives the parameter setting for the best model, that gives the highest mean score (``search.best_score_``). scorer_ : function Scorer function used on the held out data to choose the best parameters for the model. n_splits_ : int The number of cross-validation splits (folds/iterations). Notes ----- The parameters selected are those that maximize the score of the held-out data, according to the scoring parameter. If `n_jobs` was set to a value higher than one, the data is copied for each parameter setting(and not `n_jobs` times). 
This is done for efficiency reasons if individual jobs take very little time, but may raise errors if the dataset is large and not enough memory is available. A workaround in this case is to set `pre_dispatch`. Then, the memory is copied only `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * n_jobs`. See Also -------- :class:`GridSearchCV`: Does exhaustive search over a grid of parameters. :class:`ParameterSampler`: A generator over parameter settings, constructed from param_distributions. """
[docs] def __init__(self, param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, fastr_plugin=None, memory='2G', maxlen=100, ranking_score='test_score', refit_training_workflows=False, refit_validation_workflows=False): super(RandomizedSearchCVfastr, self).__init__( param_distributions=param_distributions, scoring=scoring, fit_params=fit_params, n_iter=n_iter, random_state=random_state, n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score, n_jobspercore=n_jobspercore, fastr_plugin=fastr_plugin, memory=memory, maxlen=maxlen, ranking_score=ranking_score, refit_training_workflows=refit_training_workflows, refit_validation_workflows=refit_validation_workflows)
[docs] def fit(self, X, y=None, groups=None): """Randomized model selection and hyperparameter search. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] or [n_samples, n_output], optional Target relative to X for classification or regression; None for unsupervised learning. groups : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. """ print("Fit: " + str(self.n_iter)) sampled_params = ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state) return self._fit(X, y, groups, sampled_params)
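# Hedged usage sketch: constructing the fastr-based randomized search. The
# parameter keys and values below are placeholders, not a complete or
# validated WORC search space, and the scoring string is only an example.
def _example_random_search_fastr_setup():
    from scipy.stats import uniform

    search = RandomizedSearchCVfastr(
        param_distributions={'classifiers': ['SVM'],
                             'SVMC': uniform(loc=0, scale=10)},
        n_iter=25,
        scoring='f1_weighted',
        cv=5,
        n_jobspercore=10)
    # search.fit(X, y) would sample n_iter settings from the distributions
    # and distribute the fit-and-score jobs over a fastr network.
    return search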
[docs]class BaseSearchCVJoblib(BaseSearchCV): """Base class for hyper parameter search with cross-validation.""" def _fit(self, X, y, groups, parameter_iterable): """Actual fitting, performing the search over parameters.""" regressors = ['SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet'] isclassifier =\ not any(clf in regressors for clf in self.param_distributions['classifiers']) # Check the cross-validation object and do the splitting cv = check_cv(self.cv, y, classifier=isclassifier) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print(f"Fitting {n_splits} folds for each of {n_candidates}" +\ " candidates, totalling" +\ " {n_candidates * n_splits} fits") pre_dispatch = self.pre_dispatch cv_iter = list(cv.split(X, y, groups)) # Check fitting parameters fit_params = _check_fit_params(X, self.fit_params) # Draw parameter sample for num, parameters in enumerate(parameter_iterable): parameter_sample = parameters break # Preprocess features if required if 'FeatPreProcess' in parameter_sample: if parameter_sample['FeatPreProcess'] == 'True': print("Preprocessing features.") feature_values = np.asarray([x[0] for x in X]) feature_labels = np.asarray([x[1] for x in X]) preprocessor = Preprocessor(verbose=False) preprocessor.fit(feature_values, feature_labels=feature_labels[0, :]) feature_values = preprocessor.transform(feature_values) feature_labels = preprocessor.transform(feature_labels) X = [(values, labels) for values, labels in zip(feature_values, feature_labels)] out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )(delayed(fit_and_score)(X, y, self.scoring, train, test, parameters, fit_params=fit_params, return_train_score=self.return_train_score, return_n_test_samples=True, return_times=True, return_parameters=False, return_estimator=False, error_score=self.error_score, verbose=False, return_all=False) for parameters in parameter_iterable for train, test in cv_iter) save_data = zip(*out) # if one choose to see train score, "out" will contain train score info if self.return_train_score: if self.refit_training_workflows: if self.refit_validation_workflows: (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters_all, fitted_workflows, fitted_validation_workflows) =\ zip(*save_data) else: fitted_validation_workflows = None (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters_all, fitted_workflows) =\ zip(*save_data) else: fitted_workflows = None if self.refit_validation_workflows: (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters_all, fitted_validation_workflows) =\ zip(*save_data) else: fitted_validation_workflows = None (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters_all) =\ zip(*save_data) else: if self.refit_training_workflows: if self.refit_validation_workflows: (test_scores, test_sample_counts, fit_time, score_time, parameters_all, fitted_workflows, fitted_validation_workflows) =\ zip(*save_data) else: fitted_validation_workflows = None (test_scores, test_sample_counts, fit_time, score_time, parameters_all, fitted_workflows) =\ zip(*save_data) else: fitted_workflows = None if self.refit_validation_workflows: (test_scores, test_sample_counts, fit_time, score_time, parameters_all, fitted_validation_workflows) =\ zip(*save_data) else: (test_scores, test_sample_counts, fit_time, score_time, parameters_all) =\ 
zip(*save_data) self.process_fit(n_splits=n_splits, parameters_all=parameters_all, test_sample_counts=test_sample_counts, test_score_dicts=test_scores, train_score_dicts=train_scores, fit_time=fit_time, score_time=score_time, cv_iter=cv_iter, X=X, y=y, fitted_workflows=fitted_workflows, fitted_validation_workflows=fitted_validation_workflows) return self
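# Minimal sketch of the joblib fan-out used in BaseSearchCVJoblib._fit above:
# every candidate parameter setting is evaluated on every cross-validation
# split, so the number of jobs equals n_candidates * n_splits. The toy scoring
# function below stands in for WORC's fit_and_score and is not part of the
# source; Parallel and delayed come from the module-level joblib import.
def _example_joblib_fanout(cv_iter, parameter_iterable, n_jobs=1):
    def _toy_fit_and_score(parameters, train, test):
        # One result dictionary per (parameter setting, split) combination
        return {'parameters': parameters, 'n_test': len(test), 'score': 0.0}

    out = Parallel(n_jobs=n_jobs)(
        delayed(_toy_fit_and_score)(parameters, train, test)
        for parameters in parameter_iterable
        for train, test in cv_iter)
    return out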
[docs]class GridSearchCVfastr(BaseSearchCVfastr): """Exhaustive search over specified parameter values for an estimator. Important members are fit, predict. GridSearchCV implements a "fit" and a "score" method. It also implements "predict", "predict_proba", "decision_function", "transform" and "inverse_transform" if they are implemented in the estimator used. The parameters of the estimator used to apply these methods are optimized by cross-validated grid-search over a parameter grid. Read more in the sklearn user guide. Parameters ---------- estimator : estimator object. This is assumed to implement the scikit-learn estimator interface. Either estimator needs to provide a ``score`` function, or ``scoring`` must be passed. param_grid : dict or list of dictionaries Dictionary with parameters names (string) as keys and lists of parameter settings to try as values, or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored. This enables searching over any sequence of parameter settings. scoring : string, callable or None, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. If ``None``, the ``score`` method of the estimator is used. fit_params : dict, optional Parameters to pass to the fit method. n_jobs : int, default=1 Number of jobs to run in parallel. pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use this for lightweight and fast-running jobs, to avoid delays due to on-demand spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' iid : boolean, default=True If True, the data is assumed to be identically distributed across the folds, and the loss minimized is the total loss per sample, and not the mean loss across the folds. cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - An object to be used as a cross-validation generator. - An iterable yielding train, test splits. For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. Refer the sklearn user guide for the various cross-validation strategies that can be used here. refit : boolean, default=True Refit the best estimator with the entire dataset. If "False", it is impossible to make predictions using this GridSearchCV instance after fitting. verbose : integer Controls the verbosity: the higher, the more messages. error_score : 'raise' (default) or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. return_train_score : boolean, default=True If ``'False'``, the ``cv_results_`` attribute will not include training scores. 
Examples -------- >>> from sklearn import svm, datasets >>> from sklearn.model_selection import GridSearchCV >>> iris = datasets.load_iris() >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} >>> svr = svm.SVC() >>> clf = GridSearchCV(svr, parameters) >>> clf.fit(iris.data, iris.target) ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS GridSearchCV(cv=None, error_score=..., estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=..., decision_function_shape=None, degree=..., gamma=..., kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=..., verbose=False), fit_params={}, iid=..., n_jobs=1, param_grid=..., pre_dispatch=..., refit=..., return_train_score=..., scoring=..., verbose=...) >>> sorted(clf.cv_results_.keys()) ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS ['mean_fit_time', 'mean_score_time', 'mean_test_score',... 'mean_train_score', 'param_C', 'param_kernel', 'params',... 'rank_test_score', 'split0_test_score',... 'split0_train_score', 'split1_test_score', 'split1_train_score',... 'split2_test_score', 'split2_train_score',... 'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score'...] Attributes ---------- cv_results_ : dict of numpy (masked) ndarrays A dict with keys as column headers and values as columns, that can be imported into a pandas ``DataFrame``. For instance the below given table +------------+-----------+------------+-----------------+---+---------+ |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_....| +============+===========+============+=================+===+=========+ | 'poly' | -- | 2 | 0.8 |...| 2 | +------------+-----------+------------+-----------------+---+---------+ | 'poly' | -- | 3 | 0.7 |...| 4 | +------------+-----------+------------+-----------------+---+---------+ | 'rbf' | 0.1 | -- | 0.8 |...| 3 | +------------+-----------+------------+-----------------+---+---------+ | 'rbf' | 0.2 | -- | 0.9 |...| 1 | +------------+-----------+------------+-----------------+---+---------+ will be represented by a ``cv_results_`` dict of:: { 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'], mask = [False False False False]...) 'param_gamma': masked_array(data = [-- -- 0.1 0.2], mask = [ True True False False]...), 'param_degree': masked_array(data = [2.0 3.0 -- --], mask = [False False True True]...), 'split0_test_score' : [0.8, 0.7, 0.8, 0.9], 'split1_test_score' : [0.82, 0.5, 0.7, 0.78], 'mean_test_score' : [0.81, 0.60, 0.75, 0.82], 'std_test_score' : [0.02, 0.01, 0.03, 0.03], 'rank_test_score' : [2, 4, 3, 1], 'split0_train_score' : [0.8, 0.9, 0.7], 'split1_train_score' : [0.82, 0.5, 0.7], 'mean_train_score' : [0.81, 0.7, 0.7], 'std_train_score' : [0.03, 0.03, 0.04], 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], 'std_fit_time' : [0.01, 0.02, 0.01, 0.01], 'mean_score_time' : [0.007, 0.06, 0.04, 0.04], 'std_score_time' : [0.001, 0.002, 0.003, 0.005], 'params' : [{'kernel': 'poly', 'degree': 2}, ...], } NOTE that the key ``'params'`` is used to store a list of parameter settings dict for all the parameter candidates. The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and ``std_score_time`` are all in seconds. best_estimator_ : estimator Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data. Not available if refit=False. best_score_ : float Score of best_estimator on the left out data. best_params_ : dict Parameter setting that gave the best results on the hold out data. 
best_index_ : int The index (of the ``cv_results_`` arrays) which corresponds to the best candidate parameter setting. The dict at ``search.cv_results_['params'][search.best_index_]`` gives the parameter setting for the best model, that gives the highest mean score (``search.best_score_``). scorer_ : function Scorer function used on the held out data to choose the best parameters for the model. n_splits_ : int The number of cross-validation splits (folds/iterations). Notes ------ The parameters selected are those that maximize the score of the left out data, unless an explicit score is passed in which case it is used instead. If `n_jobs` was set to a value higher than one, the data is copied for each point in the grid (and not `n_jobs` times). This is done for efficiency reasons if individual jobs take very little time, but may raise errors if the dataset is large and not enough memory is available. A workaround in this case is to set `pre_dispatch`. Then, the memory is copied only `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * n_jobs`. See Also --------- :class:`ParameterGrid`: generates all the combinations of a hyperparameter grid. :func:`sklearn.model_selection.train_test_split`: utility function to split the data into a development set usable for fitting a GridSearchCV instance and an evaluation set for its final evaluation. :func:`sklearn.metrics.make_scorer`: Make a scorer from a performance metric or loss function. """
[docs] def __init__(self, estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score='raise', return_train_score=True): super(GridSearchCVfastr, self).__init__( scoring=scoring, fit_params=fit_params, n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score, fastr_plugin=None, memory='2G') self.param_grid = param_grid _check_param_grid(param_grid)
[docs] def fit(self, X, y=None, groups=None): """Run fit with all sets of parameters. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] or [n_samples, n_output], optional Target relative to X for classification or regression; None for unsupervised learning. groups : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. """ return self._fit(X, y, groups, ParameterGrid(self.param_grid))
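# Sketch: GridSearchCVfastr expands param_grid with sklearn's ParameterGrid
# (imported at the top of this module), so the number of candidates is the
# product of the lengths of the value lists. The keys below are illustrative
# only.
def _example_parameter_grid():
    grid = ParameterGrid({'kernel': ['linear', 'rbf'], 'C': [1, 10]})
    return list(grid)  # four candidate settings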
[docs]class RandomizedSearchCVJoblib(BaseSearchCVJoblib): """Randomized search on hyper parameters. RandomizedSearchCV implements a "fit" and a "score" method. It also implements "predict", "predict_proba", "decision_function", "transform" and "inverse_transform" if they are implemented in the estimator used. The parameters of the estimator used to apply these methods are optimized by cross-validated search over parameter settings. In contrast to GridSearchCV, not all parameter values are tried out, but rather a fixed number of parameter settings is sampled from the specified distributions. The number of parameter settings that are tried is given by n_iter. If all parameters are presented as a list, sampling without replacement is performed. If at least one parameter is given as a distribution, sampling with replacement is used. It is highly recommended to use continuous distributions for continuous parameters. Read more in the sklearn user guide. Parameters ---------- estimator : estimator object. A object of that type is instantiated for each grid point. This is assumed to implement the scikit-learn estimator interface. Either estimator needs to provide a ``score`` function, or ``scoring`` must be passed. param_distributions : dict Dictionary with parameters names (string) as keys and distributions or lists of parameters to try. Distributions must provide a ``rvs`` method for sampling (such as those from scipy.stats.distributions). If a list is given, it is sampled uniformly. n_iter : int, default=10 Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution. scoring : string, callable or None, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. If ``None``, the ``score`` method of the estimator is used. fit_params : dict, optional Parameters to pass to the fit method. n_jobs : int, default=1 Number of jobs to run in parallel. pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use this for lightweight and fast-running jobs, to avoid delays due to on-demand spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' iid : boolean, default=True If True, the data is assumed to be identically distributed across the folds, and the loss minimized is the total loss per sample, and not the mean loss across the folds. cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - An object to be used as a cross-validation generator. - An iterable yielding train, test splits. For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. Refer sklearn user guide for the various cross-validation strategies that can be used here. refit : boolean, default=True Refit the best estimator with the entire dataset. 
If "False", it is impossible to make predictions using this RandomizedSearchCV instance after fitting. verbose : integer Controls the verbosity: the higher, the more messages. random_state : int or RandomState Pseudo random number generator state used for random uniform sampling from lists of possible values instead of scipy.stats distributions. error_score : 'raise' (default) or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. return_train_score : boolean, default=True If ``'False'``, the ``cv_results_`` attribute will not include training scores. Attributes ---------- cv_results_ : dict of numpy (masked) ndarrays A dict with keys as column headers and values as columns, that can be imported into a pandas ``DataFrame``. For instance the below given table +--------------+-------------+-------------------+---+---------------+ | param_kernel | param_gamma | split0_test_score |...|rank_test_score| +==============+=============+===================+===+===============+ | 'rbf' | 0.1 | 0.8 |...| 2 | +--------------+-------------+-------------------+---+---------------+ | 'rbf' | 0.2 | 0.9 |...| 1 | +--------------+-------------+-------------------+---+---------------+ | 'rbf' | 0.3 | 0.7 |...| 1 | +--------------+-------------+-------------------+---+---------------+ will be represented by a ``cv_results_`` dict of:: { 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'], mask = False), 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False), 'split0_test_score' : [0.8, 0.9, 0.7], 'split1_test_score' : [0.82, 0.5, 0.7], 'mean_test_score' : [0.81, 0.7, 0.7], 'std_test_score' : [0.02, 0.2, 0.], 'rank_test_score' : [3, 1, 1], 'split0_train_score' : [0.8, 0.9, 0.7], 'split1_train_score' : [0.82, 0.5, 0.7], 'mean_train_score' : [0.81, 0.7, 0.7], 'std_train_score' : [0.03, 0.03, 0.04], 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], 'std_fit_time' : [0.01, 0.02, 0.01, 0.01], 'mean_score_time' : [0.007, 0.06, 0.04, 0.04], 'std_score_time' : [0.001, 0.002, 0.003, 0.005], 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...], } NOTE that the key ``'params'`` is used to store a list of parameter settings dict for all the parameter candidates. The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and ``std_score_time`` are all in seconds. best_estimator_ : estimator Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data. Not available if refit=False. best_score_ : float Score of best_estimator on the left out data. best_params_ : dict Parameter setting that gave the best results on the hold out data. best_index_ : int The index (of the ``cv_results_`` arrays) which corresponds to the best candidate parameter setting. The dict at ``search.cv_results_['params'][search.best_index_]`` gives the parameter setting for the best model, that gives the highest mean score (``search.best_score_``). scorer_ : function Scorer function used on the held out data to choose the best parameters for the model. n_splits_ : int The number of cross-validation splits (folds/iterations). Notes ----- The parameters selected are those that maximize the score of the held-out data, according to the scoring parameter. If `n_jobs` was set to a value higher than one, the data is copied for each parameter setting(and not `n_jobs` times). 
This is done for efficiency reasons if individual jobs take very little time, but may raise errors if the dataset is large and not enough memory is available. A workaround in this case is to set `pre_dispatch`. Then, the memory is copied only `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * n_jobs`. See Also -------- :class:`GridSearchCV`: Does exhaustive search over a grid of parameters. :class:`ParameterSampler`: A generator over parameter settings, constructed from param_distributions. """
[docs] def __init__(self, param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, maxlen=100, ranking_score='test_score'): super(RandomizedSearchCVJoblib, self).__init__( param_distributions=param_distributions, n_iter=n_iter, scoring=scoring, fit_params=fit_params, n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score, n_jobspercore=n_jobspercore, random_state=random_state, maxlen=maxlen, ranking_score=ranking_score)
[docs] def fit(self, X, y=None, groups=None): """Run fit on the estimator with randomly drawn parameters. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] or [n_samples, n_output], optional Target relative to X for classification or regression; None for unsupervised learning. groups : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. """ sampled_params = ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state) return self._fit(X, y, groups, sampled_params)
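# Sketch: the randomized searches above draw their candidates with sklearn's
# ParameterSampler. Plain lists are sampled uniformly, while scipy.stats
# distributions are sampled through their rvs() method. Keys and values are
# illustrative only.
def _example_parameter_sampler(n_iter=5, random_state=42):
    from scipy.stats import expon

    sampler = ParameterSampler({'C': expon(scale=100),
                                'kernel': ['linear', 'rbf']},
                               n_iter=n_iter, random_state=random_state)
    return list(sampler)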
[docs]class GridSearchCVJoblib(BaseSearchCVJoblib): """Exhaustive search over specified parameter values for an estimator. Important members are fit, predict. GridSearchCV implements a "fit" and a "score" method. It also implements "predict", "predict_proba", "decision_function", "transform" and "inverse_transform" if they are implemented in the estimator used. The parameters of the estimator used to apply these methods are optimized by cross-validated grid-search over a parameter grid. Read more in the sklearn user guide. Parameters ---------- estimator : estimator object. This is assumed to implement the scikit-learn estimator interface. Either estimator needs to provide a ``score`` function, or ``scoring`` must be passed. param_grid : dict or list of dictionaries Dictionary with parameters names (string) as keys and lists of parameter settings to try as values, or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored. This enables searching over any sequence of parameter settings. scoring : string, callable or None, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. If ``None``, the ``score`` method of the estimator is used. fit_params : dict, optional Parameters to pass to the fit method. n_jobs : int, default=1 Number of jobs to run in parallel. pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use this for lightweight and fast-running jobs, to avoid delays due to on-demand spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' iid : boolean, default=True If True, the data is assumed to be identically distributed across the folds, and the loss minimized is the total loss per sample, and not the mean loss across the folds. cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - An object to be used as a cross-validation generator. - An iterable yielding train, test splits. For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. Refer sklearn user guide for the various cross-validation strategies that can be used here. refit : boolean, default=True Refit the best estimator with the entire dataset. If "False", it is impossible to make predictions using this GridSearchCV instance after fitting. verbose : integer Controls the verbosity: the higher, the more messages. error_score : 'raise' (default) or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. return_train_score : boolean, default=True If ``'False'``, the ``cv_results_`` attribute will not include training scores. 
Examples -------- >>> from sklearn import svm, datasets >>> from sklearn.model_selection import GridSearchCV >>> iris = datasets.load_iris() >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} >>> svr = svm.SVC() >>> clf = GridSearchCV(svr, parameters) >>> clf.fit(iris.data, iris.target) ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS GridSearchCV(cv=None, error_score=..., estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=..., decision_function_shape=None, degree=..., gamma=..., kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=..., verbose=False), fit_params={}, iid=..., n_jobs=1, param_grid=..., pre_dispatch=..., refit=..., return_train_score=..., scoring=..., verbose=...) >>> sorted(clf.cv_results_.keys()) ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS ['mean_fit_time', 'mean_score_time', 'mean_test_score',... 'mean_train_score', 'param_C', 'param_kernel', 'params',... 'rank_test_score', 'split0_test_score',... 'split0_train_score', 'split1_test_score', 'split1_train_score',... 'split2_test_score', 'split2_train_score',... 'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score'...] Attributes ---------- cv_results_ : dict of numpy (masked) ndarrays A dict with keys as column headers and values as columns, that can be imported into a pandas ``DataFrame``. For instance the below given table +------------+-----------+------------+-----------------+---+---------+ |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_....| +============+===========+============+=================+===+=========+ | 'poly' | -- | 2 | 0.8 |...| 2 | +------------+-----------+------------+-----------------+---+---------+ | 'poly' | -- | 3 | 0.7 |...| 4 | +------------+-----------+------------+-----------------+---+---------+ | 'rbf' | 0.1 | -- | 0.8 |...| 3 | +------------+-----------+------------+-----------------+---+---------+ | 'rbf' | 0.2 | -- | 0.9 |...| 1 | +------------+-----------+------------+-----------------+---+---------+ will be represented by a ``cv_results_`` dict of:: { 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'], mask = [False False False False]...) 'param_gamma': masked_array(data = [-- -- 0.1 0.2], mask = [ True True False False]...), 'param_degree': masked_array(data = [2.0 3.0 -- --], mask = [False False True True]...), 'split0_test_score' : [0.8, 0.7, 0.8, 0.9], 'split1_test_score' : [0.82, 0.5, 0.7, 0.78], 'mean_test_score' : [0.81, 0.60, 0.75, 0.82], 'std_test_score' : [0.02, 0.01, 0.03, 0.03], 'rank_test_score' : [2, 4, 3, 1], 'split0_train_score' : [0.8, 0.9, 0.7], 'split1_train_score' : [0.82, 0.5, 0.7], 'mean_train_score' : [0.81, 0.7, 0.7], 'std_train_score' : [0.03, 0.03, 0.04], 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], 'std_fit_time' : [0.01, 0.02, 0.01, 0.01], 'mean_score_time' : [0.007, 0.06, 0.04, 0.04], 'std_score_time' : [0.001, 0.002, 0.003, 0.005], 'params' : [{'kernel': 'poly', 'degree': 2}, ...], } NOTE that the key ``'params'`` is used to store a list of parameter settings dict for all the parameter candidates. The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and ``std_score_time`` are all in seconds. best_estimator_ : estimator Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data. Not available if refit=False. best_score_ : float Score of best_estimator on the left out data. best_params_ : dict Parameter setting that gave the best results on the hold out data. 
best_index_ : int The index (of the ``cv_results_`` arrays) which corresponds to the best candidate parameter setting. The dict at ``search.cv_results_['params'][search.best_index_]`` gives the parameter setting for the best model, that gives the highest mean score (``search.best_score_``). scorer_ : function Scorer function used on the held out data to choose the best parameters for the model. n_splits_ : int The number of cross-validation splits (folds/iterations). Notes ------ The parameters selected are those that maximize the score of the left out data, unless an explicit score is passed in which case it is used instead. If `n_jobs` was set to a value higher than one, the data is copied for each point in the grid (and not `n_jobs` times). This is done for efficiency reasons if individual jobs take very little time, but may raise errors if the dataset is large and not enough memory is available. A workaround in this case is to set `pre_dispatch`. Then, the memory is copied only `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * n_jobs`. See Also --------- :class:`ParameterGrid`: generates all the combinations of a hyperparameter grid. :func:`sklearn.model_selection.train_test_split`: utility function to split the data into a development set usable for fitting a GridSearchCV instance and an evaluation set for its final evaluation. :func:`sklearn.metrics.make_scorer`: Make a scorer from a performance metric or loss function. """
[docs] def __init__(self, estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score='raise', return_train_score=True): super(GridSearchCVJoblib, self).__init__( scoring=scoring, fit_params=fit_params, n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score) self.param_grid = param_grid _check_param_grid(param_grid)
[docs] def fit(self, X, y=None, groups=None): """Run fit with all sets of parameters. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] or [n_samples, n_output], optional Target relative to X for classification or regression; None for unsupervised learning. groups : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. """ return self._fit(X, y, groups, ParameterGrid(self.param_grid))
[docs]class BaseSearchCVSMAC(BaseSearchCV): """Base class for Bayesian hyper parameter search with cross-validation.""" def _fit(self, groups): """Actual fitting, performing the search over parameters.""" regressors = ['SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet'] isclassifier = \ not any(clf in regressors for clf in self.param_distributions['Classification']['classifiers']) cv = check_cv(self.cv, self.labels, classifier=isclassifier) self.features, self.labels, groups = indexable(self.features, self.labels, groups) n_splits = cv.get_n_splits(self.features, self.labels, groups) pre_dispatch = self.pre_dispatch cv_iter = list(cv.split(self.features, self.labels, groups)) # Build the SMAC configuration self.param_distributions['Other'] = dict() self.param_distributions['Other']['random_seed'] = np.random.randint(1, 5000) cs = build_smac_config(self.param_distributions) # Run the optimization # Here we will create and execute a fastr network # Create temporary directory for fastr if DebugDetector().do_detection(): # Specific name for easy debugging debugnum = 0 name = 'DEBUG_' + str(debugnum) tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name) while os.path.exists(tempfolder): debugnum += 1 name = 'DEBUG_' + str(debugnum) tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name) else: name = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name) if not os.path.exists(tempfolder): os.makedirs(tempfolder) # Create the files containing the estimator and settings estimator_labels = ['X', 'y', 'search_space', 'cv_iter', 'scoring', 'verbose', 'fit_params', 'return_train_score', 'return_n_test_samples', 'return_times', 'return_parameters', 'error_score', 'budget_type', 'budget', 'init_method', 'init_budget', 'smac_result_file'] estimator_data = pd.Series([self.features, self.labels, cs, cv_iter, self.scoring, False, self.fit_params, self.return_train_score, True, True, True, self.error_score, self.param_distributions['SMAC']['budget_type'], self.param_distributions['SMAC']['budget'], self.param_distributions['SMAC']['init_method'], self.param_distributions['SMAC']['init_budget'], self.smac_result_file], index=estimator_labels, name='estimator Data') fname = 'estimatordata.hdf5' estimatorname = os.path.join(tempfolder, fname) estimator_data.to_hdf(estimatorname, 'Estimator Data') estimatordata = f"vfs://tmp/GS/{name}/{fname}" # Create the files containing the instance data instance_labels = ['run_id', 'run_rng', 'run_name', 'tempfolder'] current_date_time = datetime.now() random_id = random.randint(1000, 9999) run_name = current_date_time.strftime('smac-run_' + '%m-%d_%H-%M-%S' + str(random_id)) instance_files = dict() for i in range(self.param_distributions['SMAC']['n_smac_cores']): instance_info = [i, random.randint(0, 2 ** 32 - 1), run_name, tempfolder] instance_data = pd.Series(instance_info, index=instance_labels, name=f'instance data {i}') fname = f'instancedata_{i}.hdf5' instancefolder = os.path.join(tempfolder, 'instances', fname) if not os.path.exists(os.path.dirname(instancefolder)): os.makedirs(os.path.dirname(instancefolder)) instance_data.to_hdf(instancefolder, 'Instance Data') instancedata = f'vfs://tmp/GS/{name}/instances/{fname}' instance_files[f'{i}'] = instancedata # Create the fastr network network = fastr.create_network('WORC_SMAC_' + name) estimator_data = network.create_source('HDF5', id='estimator_source') instance_data = network.create_source('HDF5', 
id='instance_source') sink_output = network.create_sink('HDF5', id='output') smac_node = network.create_node('worc/smac:1.0', tool_version='1.0', id='smac', resources=ResourceLimit(memory='5G')) smac_node.inputs['estimatordata'] = estimator_data.output smac_node.inputs['instancedata'] = instance_data.output sink_output.input = smac_node.outputs['fittedestimator'] source_data = {'estimator_source': estimatordata, 'instance_source': instance_files} sink_data = {'output': f"vfs://tmp/GS/{name}/output_{{sample_id}}_{{cardinality}}{{ext}}"} network.execute(source_data, sink_data, tmpdir=os.path.join(tempfolder, 'tmp'), execution_plugin=self.fastr_plugin) # Check whether all jobs have finished expected_no_files = len(instance_files) sink_files = glob.glob(os.path.join(fastr.config.mounts['tmp'], 'GS', name) + '/output*.hdf5') if len(sink_files) != expected_no_files: difference = expected_no_files - len(sink_files) fname = os.path.join(tempfolder, 'tmp') message = ('Fitting classifiers has failed for ' + f'{difference} / {expected_no_files} files. The temporary ' + f'results where not deleted and can be found in {tempfolder}. ' + 'Probably your fitting and scoring failed: check out ' + 'the tmp/smac folder within the tempfolder for ' + 'the fastr job temporary results or run: fastr trace ' + f'"{fname}{os.path.sep}__sink_data__.json" --samples.') raise WORCexceptions.WORCValueError(message) # Read in the output data once finished save_data = list() for output in sink_files: data = pd.read_hdf(output) save_data.extend(list(data['RET'])) # if one choose to see train score, "out" will contain train score info if self.return_train_score: (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters_est, parameters_all) = \ zip(*save_data) else: (test_scores, test_sample_counts, fit_time, score_time, parameters_est, parameters_all) = \ zip(*save_data) # Process the smac_results data once finished # First read in the results of all smac instance files smac_filenames = glob.glob(os.path.join(tempfolder, 'tested_configs', run_name) + '/smac_stats_*.json') # Then create a combined dictionary with all # results of this cross-validation split and # a summary smac_results_for_this_cv = dict() smac_results_for_this_cv[run_name] = dict() summary = dict() all_costs = [] best_cost = 1 all_runtimes = [] for fn in smac_filenames: with open(fn, 'r') as f: smac_result = json.load(f) run_data = smac_result[list(smac_result.keys())[0]] nr_of_inc_updates = run_data['inc_changed'] current_cost = run_data['inc_costs'][nr_of_inc_updates - 1] all_costs.append(current_cost) all_runtimes.append(run_data['wallclock_time_used']) if current_cost < best_cost: best_cost = current_cost summary['best_score'] = current_cost summary['best_inc_wallclock_time'] = run_data['inc_wallclock_times'][nr_of_inc_updates - 1] summary['best_inc_evals'] = run_data['inc_evaluations'][nr_of_inc_updates - 1] summary['best_inc_changed'] = run_data['inc_changed'] summary['best_config'] = run_data['inc_configs'][nr_of_inc_updates - 1] smac_results_for_this_cv[run_name].update(smac_result) summary['average_score'] = np.mean(all_costs) summary['std_score'] = np.std(all_costs) summary['shortest_runtime'] = np.min(all_runtimes) summary['longest_runtime'] = np.max(all_runtimes) summary['average_runtime'] = np.mean(all_runtimes) summary['total_runtime'] = np.sum(all_runtimes) final_summary = {'cv-summary': summary} smac_results_for_this_cv[run_name].update(final_summary) result_file = self.smac_result_file if os.path.exists(result_file): 
with open(result_file, 'r') as jsonfile: results_so_far = json.load(jsonfile) results_so_far.update(smac_results_for_this_cv) with open(result_file, 'w') as jsonfile: json.dump(results_so_far, jsonfile, indent=4) else: with open(result_file, 'a') as jsonfile: json.dump(smac_results_for_this_cv, jsonfile, indent=4) # Remove the temporary folder used if name != 'DEBUG_0': # Do delete if not debugging for first iteration shutil.rmtree(tempfolder) # Process the results of the fitting procedure self.process_fit(n_splits=n_splits, parameters_all=parameters_all, test_sample_counts=test_sample_counts, test_score_dicts=test_scores, train_score_dicts=train_scores, fit_time=fit_time, score_time=score_time, cv_iter=cv_iter, X=self.features, y=self.labels, use_smac=True) return self
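# Illustrative sketch (not part of the WORC source): how the per-instance
# smac_stats_*.json files are reduced to a single summary, following the
# aggregation in BaseSearchCVSMAC._fit above. The key names match the ones
# read there; the list of filenames is assumed to be given.
def _example_summarize_smac_runs(smac_filenames):
    summary = {'best_score': 1, 'all_costs': [], 'all_runtimes': []}
    for fn in smac_filenames:
        with open(fn, 'r') as f:
            smac_result = json.load(f)
        run_data = smac_result[list(smac_result.keys())[0]]
        # Incumbent cost after the last incumbent update of this run
        last_update = run_data['inc_changed'] - 1
        current_cost = run_data['inc_costs'][last_update]
        summary['all_costs'].append(current_cost)
        summary['all_runtimes'].append(run_data['wallclock_time_used'])
        if current_cost < summary['best_score']:
            summary['best_score'] = current_cost
            summary['best_config'] = run_data['inc_configs'][last_update]
    summary['average_score'] = float(np.mean(summary['all_costs']))
    summary['total_runtime'] = float(np.sum(summary['all_runtimes']))
    return summary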
[docs]class GuidedSearchCVSMAC(BaseSearchCVSMAC): """Guided search on hyperparameters. GuidedSearchCV implements a "fit" and a "score" method. It also implements "predict", "predict_proba", "decision_function", "transform" and "inverse_transform" if they are implemented in the estimator used. The parameters of the estimator used to apply these methods are optimized by cross-validated search over parameter settings. The optimization is performed using the Sequential Model-based Algorithm Configuration (SMAC) method. A probabilistic model of the objective function is constructed and updated with each function evaluation. If all parameters are presented as a list, sampling without replacement is performed. If at least one parameter is given as a distribution, sampling with replacement is used. It is highly recommended to use continuous distributions for continuous parameters. Parameters ---------- param_distributions : dict Dictionary with parameter names (string) as keys and details of their domains as values. From this dictionary the complete search space will later be constructed. n_iter : int, default=10 Number of function evaluations allowed in each optimization sequence of SMAC. scoring : string, callable or None, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. If ``None``, the ``score`` method of the estimator is used. fit_params : dict, optional Parameters to pass to the fit method. n_jobs : int, default=1 Number of jobs to run in parallel. pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use this for lightweight and fast-running jobs, to avoid delays due to on-demand spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' iid : boolean, default=True If True, the data is assumed to be identically distributed across the folds, and the loss minimized is the total loss per sample, and not the mean loss across the folds. cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - An object to be used as a cross-validation generator. - An iterable yielding train, test splits. For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. Refer :ref:`User Guide <cross_validation>` for the various cross-validation strategies that can be used here. refit : boolean, default=True Refit the best estimator with the entire dataset. If "False", it is impossible to make predictions using this RandomizedSearchCV instance after fitting. verbose : integer Controls the verbosity: the higher, the more messages. random_state : int or RandomState Pseudo random number generator state used for random uniform sampling from lists of possible values instead of scipy.stats distributions. error_score : 'raise' (default) or numeric Value to assign to the score if an error occurs in estimator fitting. 
If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. return_train_score : boolean, default=True If ``'False'``, the ``cv_results_`` attribute will not include training scores. Attributes ---------- cv_results_ : dict of numpy (masked) ndarrays A dict with keys as column headers and values as columns, that can be imported into a pandas ``DataFrame``. For instance the below given table +--------------+-------------+-------------------+---+---------------+ | param_kernel | param_gamma | split0_test_score |...|rank_test_score| +==============+=============+===================+===+===============+ | 'rbf' | 0.1 | 0.8 |...| 2 | +--------------+-------------+-------------------+---+---------------+ | 'rbf' | 0.2 | 0.9 |...| 1 | +--------------+-------------+-------------------+---+---------------+ | 'rbf' | 0.3 | 0.7 |...| 1 | +--------------+-------------+-------------------+---+---------------+ will be represented by a ``cv_results_`` dict of:: { 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'], mask = False), 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False), 'split0_test_score' : [0.8, 0.9, 0.7], 'split1_test_score' : [0.82, 0.5, 0.7], 'mean_test_score' : [0.81, 0.7, 0.7], 'std_test_score' : [0.02, 0.2, 0.], 'rank_test_score' : [3, 1, 1], 'split0_train_score' : [0.8, 0.9, 0.7], 'split1_train_score' : [0.82, 0.5, 0.7], 'mean_train_score' : [0.81, 0.7, 0.7], 'std_train_score' : [0.03, 0.03, 0.04], 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], 'std_fit_time' : [0.01, 0.02, 0.01, 0.01], 'mean_score_time' : [0.007, 0.06, 0.04, 0.04], 'std_score_time' : [0.001, 0.002, 0.003, 0.005], 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...], } NOTE that the key ``'params'`` is used to store a list of parameter settings dict for all the parameter candidates. The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and ``std_score_time`` are all in seconds. best_estimator_ : estimator Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data. Not available if refit=False. best_score_ : float Score of best_estimator on the left out data. best_params_ : dict Parameter setting that gave the best results on the hold out data. best_index_ : int The index (of the ``cv_results_`` arrays) which corresponds to the best candidate parameter setting. The dict at ``search.cv_results_['params'][search.best_index_]`` gives the parameter setting for the best model, that gives the highest mean score (``search.best_score_``). scorer_ : function Scorer function used on the held out data to choose the best parameters for the model. n_splits_ : int The number of cross-validation splits (folds/iterations). Notes ----- The parameters selected are those that maximize the score of the held-out data, according to the scoring parameter. If `n_jobs` was set to a value higher than one, the data is copied for each parameter setting(and not `n_jobs` times). This is done for efficiency reasons if individual jobs take very little time, but may raise errors if the dataset is large and not enough memory is available. A workaround in this case is to set `pre_dispatch`. Then, the memory is copied only `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * n_jobs`. See Also -------- :class:`GridSearchCV`: Does exhaustive search over a grid of parameters. 
:class:`ParameterSampler`: A generator over parameter settings, constructed from param_distributions. """
[docs] def __init__(self, param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, fastr_plugin=None, maxlen=100, ranking_score='test_score', features=None, labels=None, refit_training_workflows=False, refit_validation_workflows=False, smac_result_file=None): super(GuidedSearchCVSMAC, self).__init__( param_distributions=param_distributions, scoring=scoring, fit_params=fit_params, n_iter=n_iter, random_state=random_state, n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score, n_jobspercore=n_jobspercore, fastr_plugin=fastr_plugin, maxlen=maxlen, ranking_score=ranking_score, refit_training_workflows=refit_training_workflows, refit_validation_workflows=refit_validation_workflows) self.features = features self.labels = labels self.smac_result_file = smac_result_file
[docs] def fit(self, X, y=None, groups=None): """Run fit on the estimator with SMAC-guided parameter search. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] or [n_samples, n_output], optional Target relative to X for classification or regression; None for unsupervised learning. groups : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. """ print("Fit: " + str(self.n_iter)) self.features = X self.labels = y return self._fit(groups)
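# Hedged usage sketch: GuidedSearchCVSMAC expects a param_distributions
# dictionary containing at least a 'Classification' section with the candidate
# classifiers and an 'SMAC' section with the optimization budget, as read in
# BaseSearchCVSMAC._fit above. The concrete values below are placeholders,
# not a validated WORC configuration.
def _example_guided_search_setup():
    search_space = {
        'Classification': {'classifiers': ['SVM']},
        'SMAC': {'budget_type': 'evals',
                 'budget': 50,
                 'init_method': 'random',
                 'init_budget': 10,
                 'n_smac_cores': 1},
    }
    search = GuidedSearchCVSMAC(param_distributions=search_space,
                                scoring='f1_weighted',
                                cv=5,
                                smac_result_file='smac_results.json')
    # search.fit(X, y) would build the SMAC configuration space and optimize
    # the workflow hyperparameters through the fastr SMAC tool.
    return search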