Source code for WORC.classification.SearchCV

#!/usr/bin/env python

# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from abc import ABCMeta, abstractmethod
from collections.abc import Sized
import numpy as np
import warnings
import numbers
import random
import string
import fastr
from fastr.api import ResourceLimit
from joblib import Parallel, delayed
from scipy.stats import rankdata
import six
import pandas as pd
import json
import glob
from itertools import islice
import shutil

from sklearn.model_selection._search import ParameterSampler
from sklearn.model_selection._search import ParameterGrid, _check_param_grid
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, is_classifier, clone
from sklearn.base import MetaEstimatorMixin
from sklearn.exceptions import NotFittedError
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.utils.validation import indexable, check_is_fitted
from sklearn.model_selection._split import check_cv
from sklearn.metrics import f1_score, roc_auc_score, mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.utils.validation import _check_fit_params
from sklearn.model_selection._validation import _aggregate_score_dicts

from WORC.classification.fitandscore import fit_and_score, replacenan
from WORC.classification.metrics import check_multimetric_scoring
from WORC.classification import construct_classifier as cc
from WORC.featureprocessing.Preprocessor import Preprocessor
from WORC.detectors.detectors import DebugDetector
import WORC.addexceptions as WORCexceptions


[docs]def rms_score(truth, prediction):
    """Root-mean-square-error metric."""
    return np.sqrt(mean_squared_error(truth, prediction))


[docs]def sar_score(truth, prediction):
    """SAR metric from Caruana et al. 2004."""
    ROC = roc_auc_score(truth, prediction)
    # Convert score to binaries first
    for num in range(0, len(prediction)):
        if prediction[num] >= 0.5:
            prediction[num] = 1
        else:
            prediction[num] = 0

    ACC = accuracy_score(truth, prediction)
    RMS = rms_score(truth, prediction)
    SAR = (ACC + ROC + (1 - RMS))/3
    return SAR


[docs]def chunksdict(data, SIZE):
    """Split a dictionary in equal parts of certain slice."""
    it = iter(data)
    for i in range(0, len(data), SIZE):
        yield {k: data[k] for k in islice(it, SIZE)}


[docs]def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


[docs]class Ensemble(six.with_metaclass(ABCMeta, BaseEstimator,
                                  MetaEstimatorMixin)):
    """Ensemble of BaseSearchCV Estimators."""

    # @abstractmethod
[docs]    def __init__(self, estimators):
        """Initialize object with list of estimators."""
        if not estimators:
            message = 'You supplied an empty list of estimators: No ensemble creation possible.'
            raise WORCexceptions.WORCValueError(message)
        self.estimators = estimators
        self.n_estimators = len(estimators)

[docs]    def predict(self, X):
        """Call predict on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self.estimators[0]._check_is_fitted('predict')

        # Check if we are dealing with multilabel
        if len(self.estimators[0].predict(X).shape) == 1:
            nlabels = 1
        else:
            nlabels = self.estimators[0].predict(X).shape[1]

        if type(self.estimators[0].best_estimator_) == OneVsRestClassifier:
            multilabel = True
        elif nlabels > 1:
            multilabel = True
        else:
            multilabel = False

        if multilabel:
            # Multilabel
            outcome = np.zeros((self.n_estimators, len(X), nlabels))
            for num, est in enumerate(self.estimators):
                if hasattr(est, 'predict_proba'):
                    # BUG: SVM kernel can be wrong type
                    if hasattr(est.best_estimator_, 'kernel'):
                        est.best_estimator_.kernel = str(est.best_estimator_.kernel)
                    outcome[num, :, :] = est.predict_proba(X)
                else:
                    outcome[num, :, :] = est.predict(X)

            # Replace NAN if they are there
            if np.isnan(outcome).any():
                print('[WARNING] Predictions contain NaN, removing those rows.')
                outcome = outcome[~np.isnan(outcome).any(axis=1)]

            outcome = np.squeeze(np.mean(outcome, axis=0))

            # NOTE: Binarize specifically for multiclass
            for i in range(0, outcome.shape[0]):
                label = np.argmax(outcome[i, :])
                outcome[i, :] = np.zeros(outcome.shape[1])
                outcome[i, label] = 1

        else:
            # Singlelabel
            outcome = np.zeros((self.n_estimators, len(X)))
            for num, est in enumerate(self.estimators):
                if hasattr(est, 'predict_proba'):
                    # BUG: SVM kernel can be wrong type
                    if hasattr(est.best_estimator_, 'kernel'):
                        est.best_estimator_.kernel = str(est.best_estimator_.kernel)
                    outcome[num, :] = est.predict_proba(X)[:, 1]
                else:
                    outcome[num, :] = est.predict(X)

            # Replace NAN if they are there
            outcome = outcome[~np.isnan(outcome).any(axis=1)]

            outcome = np.squeeze(np.mean(outcome, axis=0))

            # Binarize
            isclassifier = is_classifier(est.best_estimator_)

            if isclassifier:
                outcome[outcome >= 0.5] = 1
                outcome[outcome < 0.5] = 0

        return outcome

[docs]    def predict_proba(self, X):
        """Call predict_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict_proba``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self.estimators[0]._check_is_fitted('predict_proba')

        # Check if we are dealing with multilabel
        if len(self.estimators[0].predict(X).shape) == 1:
            nlabels = 1
        else:
            nlabels = self.estimators[0].predict(X).shape[1]

        if type(self.estimators[0].best_estimator_) == OneVsRestClassifier:
            multilabel = True
        elif nlabels > 1:
            multilabel = True
        else:
            multilabel = False

        if multilabel:
            # Multilabel
            outcome = np.zeros((self.n_estimators, len(X), nlabels))
            for num, est in enumerate(self.estimators):
                if hasattr(est, 'predict_proba'):
                    # BUG: SVM kernel can be wrong type
                    if hasattr(est.best_estimator_, 'kernel'):
                        est.best_estimator_.kernel = str(est.best_estimator_.kernel)
                    outcome[num, :, :] = est.predict_proba(X)
                else:
                    outcome[num, :, :] = est.predict(X)

            # Replace NAN if they are there
            if np.isnan(outcome).any():
                print('[WARNING] Predictions contain NaN, removing those rows.')
                outcome = outcome[~np.isnan(outcome).any(axis=1)]

            outcome = np.squeeze(np.mean(outcome, axis=0))
        else:
            # Single label
            # For probabilities, we get both a class0 and a class1 score
            outcome = np.zeros((len(X), 2))
            outcome_class1 = np.zeros((self.n_estimators, len(X)))
            outcome_class2 = np.zeros((self.n_estimators, len(X)))
            for num, est in enumerate(self.estimators):
                # BUG: SVM kernel can be wrong type
                if hasattr(est.best_estimator_, 'kernel'):
                    est.best_estimator_.kernel = str(est.best_estimator_.kernel)
                outcome_class1[num, :] = est.predict_proba(X)[:, 0]
                outcome_class2[num, :] = est.predict_proba(X)[:, 1]

            outcome[:, 0] = np.squeeze(np.mean(outcome_class1, axis=0))
            outcome[:, 1] = np.squeeze(np.mean(outcome_class2, axis=0))

        return outcome

[docs]    def predict_log_proba(self, X):
        """Call predict_log_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict_log_proba``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self.estimators[0]._check_is_fitted('predict_log_proba')

        outcome = np.zeros((self.n_estimators, len(X)))
        for num, est in enumerate(self.estimators):
            outcome[num, :] = est.predict_log_proba(X)

        outcome = np.squeeze(np.mean(outcome, axis=0))
        return outcome

[docs]    def decision_function(self, X):
        """Call decision_function on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``decision_function``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self.estimators[0]._check_is_fitted('decision_function')

        # NOTE: Check if we are dealing with multilabel
        if type(self.estimators[0].best_estimator_) == OneVsRestClassifier:
            # Multilabel
            nlabels = self.estimators[0].decision_function(X).shape[1]
            outcome = np.zeros((self.n_estimators, len(X), nlabels))
            for num, est in enumerate(self.estimators):
                outcome[num, :, :] = est.decision_function(X)

            outcome = np.squeeze(np.mean(outcome, axis=0))

        else:
            # Singlelabel
            outcome = np.zeros((self.n_estimators, len(X)))
            for num, est in enumerate(self.estimators):
                outcome[num, :] = est.decision_function(X)

            outcome = np.squeeze(np.mean(outcome, axis=0))

        return outcome

[docs]    def transform(self, X):
        """Call transform on the estimator with the best found parameters.

        Only available if the underlying estimator supports ``transform`` and
        ``refit=True``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self.estimators[0]._check_is_fitted('transform')

        outcome = np.zeros((self.n_estimators, len(X)))
        for num, est in enumerate(self.estimators):
            outcome[num, :] = est.transform(X)

        outcome = np.squeeze(np.mean(outcome, axis=0))
        return outcome

[docs]    def inverse_transform(self, Xt):
        """Call inverse_transform on the estimator with the best found params.

        Only available if the underlying estimator implements
        ``inverse_transform`` and ``refit=True``.

        Parameters
        -----------
        Xt : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self.estimators[0]._check_is_fitted('inverse_transform')

        outcome = np.zeros((self.n_estimators, len(Xt)))
        for num, est in enumerate(self.estimators):
            outcome[num, :] = est.transform(Xt)

        outcome = np.squeeze(np.mean(outcome, axis=0))
        return outcome


[docs]class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator,
                                      MetaEstimatorMixin)):
    """Base class for hyper parameter search with cross-validation."""

[docs]    @abstractmethod
    def __init__(self, param_distributions={}, n_iter=10, scoring=None,
                 fit_params=None, n_jobs=1, iid=True,
                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
                 random_state=None, error_score='raise',
                 return_train_score=True,
                 n_jobspercore=100, maxlen=100, fastr_plugin=None, memory='2G',
                 ranking_score='test_score', refit_workflows=False):
        """Initialize SearchCV Object."""
        # Added for fastr and joblib executions
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.n_jobspercore = n_jobspercore
        self.random_state = random_state
        self.ensemble = list()
        self.fastr_plugin = fastr_plugin
        self.memory = memory

        # Below are the defaults from sklearn
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.fit_params = fit_params if fit_params is not None else {}
        self.iid = iid
        self.refit = refit
        self.cv = cv
        self.verbose = verbose
        self.pre_dispatch = pre_dispatch

        # Manually added steps
        self.error_score = error_score
        self.return_train_score = return_train_score
        self.maxlen = maxlen
        self.ranking_score = ranking_score
        self.refit_workflows = refit_workflows
        self.fitted_workflows = list()

        # Only for WORC Paper
        self.test_RS = True

    @property
    def _estimator_type(self):
        return self.estimator._estimator_type

[docs]    def score(self, X, y=None):
        """Compute the score (i.e. probability) on a given data.

        This uses the score defined by ``scoring`` where provided, and the
        ``best_estimator_.score`` method otherwise.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Input data, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        Returns
        -------
        score : float

        """
        if self.scorer_ is None:
            raise ValueError("No score function explicitly defined, "
                             "and the estimator doesn't provide one %s"
                             % self.best_estimator_)

        X, y = self.preprocess(X, y)

        return self.scorer_(self.best_estimator_, X, y)

    def _check_is_fitted(self, method_name):
        if not self.refit:
            raise NotFittedError(('This GridSearchCV instance was initialized '
                                  'with refit=False. %s is '
                                  'available only after refitting on the best '
                                  'parameters. ') % method_name)
        else:
            check_is_fitted(self, 'best_estimator_')

[docs]    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def predict(self, X):
        """Call predict on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('predict')

        if self.ensemble:
            return self.ensemble.predict(X)
        else:
            X, _ = self.preprocess(X)
            return self.best_estimator_.predict(X)

[docs]    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def predict_proba(self, X):
        """Call predict_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict_proba``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('predict_proba')

        # BUG: kernel sometimes saved as unicode
        # BUG: SVM kernel can be wrong type
        if hasattr(self.best_estimator_, 'kernel'):
            self.best_estimator_.kernel = str(self.best_estimator_.kernel)
        if self.ensemble:
            return self.ensemble.predict_proba(X)
        else:
            X, _ = self.preprocess(X)
            return self.best_estimator_.predict_proba(X)

[docs]    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def predict_log_proba(self, X):
        """Call predict_log_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict_log_proba``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('predict_log_proba')

        # BUG: SVM kernel can be wrong type
        if hasattr(self.est.best_estimator_, 'kernel'):
            self.best_estimator_.kernel = str(self.best_estimator_.kernel)

        if self.ensemble:
            return self.ensemble.predict_log_proba(X)
        else:
            X, _ = self.preprocess(X)
            return self.best_estimator_.predict_log_proba(X)

[docs]    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def decision_function(self, X):
        """Call decision_function on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``decision_function``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('decision_function')

        if self.ensemble:
            return self.ensemble.decision_function(X)
        else:
            X, _ = self.preprocess(X)
            return self.best_estimator_.decision_function(X)

[docs]    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def transform(self, X):
        """Call transform on the estimator with the best found parameters.

        Only available if the underlying estimator supports ``transform`` and
        ``refit=True``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('transform')

        if self.ensemble:
            return self.ensemble.transform(X)
        else:
            X = self.preprocess(X)
            return self.best_estimator_.transform(X)

[docs]    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def inverse_transform(self, Xt):
        """Call inverse_transform on the estimator with the best found params.

        Only available if the underlying estimator implements
        ``inverse_transform`` and ``refit=True``.

        Parameters
        -----------
        Xt : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('inverse_transform')

        if self.ensemble:
            return self.ensemble.transform(Xt)
        else:
            Xt, _ = self.preprocess(Xt)
            return self.best_estimator_.transform(Xt)

[docs]    def preprocess(self, X, y=None, training=False):
        """Apply the available preprocssing methods to the features."""
        if self.best_preprocessor is not None:
            X = self.best_preprocessor.transform(X)

        if self.best_encoder is not None:
            X = self.best_encoder.transform(X)

        if self.best_imputer is not None:
            X = self.best_imputer.transform(X)

        # Replace nan if still left
        X = replacenan(np.asarray(X)).tolist()

        if self.best_groupsel is not None:
            X = self.best_groupsel.transform(X)

        if self.best_varsel is not None:
            X = self.best_varsel.transform(X)

        if not training and hasattr(self, 'overfit_scaler') and self.overfit_scaler:
            # Overfit the feature scaling on the test set
            # NOTE: Never use this in an actual model, only to assess how
            # different your features are in your train and test sets
            m = '[WORC WARNING] You choose to overfit the feature scaling. ' +\
                'Never use this in an actual model, only to assess how ' +\
                'different your features are in your train and test sets.'
            print(m)
            scaler = StandardScaler().fit(X)

            if scaler is not None:
                X = scaler.transform(X)
        else:
            if self.best_scaler is not None:
                X = self.best_scaler.transform(X)

        if self.best_reliefsel is not None:
            X = self.best_reliefsel.transform(X)

        if self.best_modelsel is not None:
            X = self.best_modelsel.transform(X)

        if self.best_pca is not None:
            X = self.best_pca.transform(X)

        if self.best_statisticalsel is not None:
            X = self.best_statisticalsel.transform(X)

        # Only resampling in training phase, i.e. if we have the labels
        if y is not None:
            if self.best_Sampler is not None:
                X, y = self.best_Sampler.transform(X, y)

        return X, y

[docs]    def process_fit(self, n_splits, parameters_all,
                    test_sample_counts, test_score_dicts,
                    train_score_dicts, fit_time, score_time, cv_iter,
                    X, y, fitted_workflows=None):
        """Process a fit.

        Process the outcomes of a SearchCV fit and find the best settings
        over all cross validations from all hyperparameters tested

        Very similar to the _format_results function or the original SearchCV.

        """
        # test_score_dicts and train_score dicts are lists of dictionaries and
        # we make them into dict of lists
        if self.verbose:
            print('Processing fits.')
        test_scores = _aggregate_score_dicts(test_score_dicts)
        if self.return_train_score:
            train_scores = _aggregate_score_dicts(train_score_dicts)

        # We take only one result per split, default by sklearn
        pipelines_per_split = int(len(parameters_all) / n_splits)
        candidate_params_all = list(parameters_all[:pipelines_per_split])
        n_candidates = len(candidate_params_all)

        # Store some of the resulting scores
        results = dict()

        # Computed the (weighted) mean and std for test scores alone
        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_."""
            array = np.transpose(np.array(array, dtype=np.float64).reshape(n_splits,
                                                                           n_candidates))

            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            try:
                array_means = np.average(array, axis=1, weights=weights)
            except ZeroDivisionError as e:
                e = f'[WORC Warning] {e}. Setting {key_name} to unweighted.'
                print(e)
                array_means = np.average(array, axis=1)

            results['mean_%s' % key_name] = array_means

            array_mins = np.min(array, axis=1)
            results['min_%s' % key_name] = array_mins

            # Weighted std is not directly available in numpy
            try:
                array_stds = np.sqrt(np.average((array -
                                                 array_means[:, np.newaxis]) ** 2,
                                                axis=1, weights=weights))
            except ZeroDivisionError as e:
                e = f'[WORC Warning] {e}. Setting {key_name} to unweighted.'
                print(e)
                array_stds = np.sqrt(np.average((array -
                                                 array_means[:, np.newaxis]) ** 2,
                                                axis=1))

            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        _store('fit_time', fit_time)
        _store('score_time', score_time)

        # Store scores
        # Check whether to do multimetric scoring
        test_estimator = cc.construct_classifier(candidate_params_all[0])
        scorers, self.multimetric_ = check_multimetric_scoring(
            test_estimator, scoring=self.scoring)

        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=np.int)

        if self.iid != 'deprecated':
            warnings.warn(
                "The parameter 'iid' is deprecated in 0.22 and will be "
                "removed in 0.24.", FutureWarning
            )
            iid = self.iid
        else:
            iid = False

        icheck = 0
        for scorer_name in scorers.keys():
            # Computed the (weighted) mean and std for test scores alone
            key_name = 'test_%s' % scorer_name
            _store('test_%s' % scorer_name, test_scores[scorer_name],
                   splits=True, rank=True,
                   weights=test_sample_counts if iid else None)

            if DebugDetector().do_detection() and icheck == 0:
                # Check the scores for some splits
                for i in range(10):
                    print('Iteration: ' + str(i))
                    print(test_scores[scorer_name][i])
                    print(results["split%d_%s" % (0, key_name)][i])
                    print(test_scores[scorer_name][i + 10])
                    print(results["split%d_%s" % (1, key_name)][i])
                    print(results['mean_%s' % key_name][i])
                    print('\n')
                    icheck += 1

            if self.return_train_score:
                _store('train_%s' % scorer_name, train_scores[scorer_name],
                       splits=True)

        # Compute the "Generalization" score
        difference_score = abs(results['mean_train_score'] - results['mean_test_score'])
        generalization_score = results['mean_test_score'] - difference_score
        results['generalization_score'] = generalization_score
        results['rank_generalization_score'] = np.asarray(
            rankdata(-results['generalization_score'], method='min'), dtype=np.int32)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, str) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers) and not callable(self.refit):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key or a "
                                 "callable to refit an estimator with the "
                                 "best parameter setting on the whole "
                                 "data and make the best_* attributes "
                                 "available for that metric. If this is "
                                 "not needed, refit should be set to "
                                 "False explicitly. %r was passed."
                                 % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            # If callable, refit is expected to return the index of the best
            # parameter set.
            if callable(self.refit):
                self.best_index_ = self.refit(results)
                if not isinstance(self.best_index_, numbers.Integral):
                    raise TypeError('best_index_ returned is not an integer')
                if (self.best_index_ < 0 or
                   self.best_index_ >= len(results["params"])):
                    raise IndexError('best_index_ index out of range')
            else:
                self.best_index_ = results["rank_test_%s"
                                           % refit_metric].argmin()
                self.best_score_ = results["mean_test_%s" % refit_metric][
                                           self.best_index_]
            self.best_params_ = candidate_params_all[self.best_index_]

        # Rank the indices of scores from all parameter settings
        ranked_test_scores = results["rank_" + self.ranking_score]
        indices = range(0, len(ranked_test_scores))
        sortedindices = [x for _, x in sorted(zip(ranked_test_scores, indices))]

        # In order to reduce the memory used, we will only save
        # a maximum of results
        maxlen = min(self.maxlen, n_candidates)
        bestindices = sortedindices[0:maxlen]

        candidate_params_all = np.asarray(candidate_params_all)[bestindices].tolist()
        for k in results.keys():
            results[k] = results[k][bestindices]
        n_candidates = len(candidate_params_all)
        results['params'] = candidate_params_all

        # Store the atributes of the best performing estimator
        best_index = np.flatnonzero(results["rank_" + self.ranking_score] == 1)[0]
        best_parameters_all = candidate_params_all[best_index]

        # Store several objects
        self.cv_results_ = results
        self.n_splits_ = n_splits
        self.cv_iter = cv_iter
        self.best_index_ = best_index
        self.best_params_ = results["params"][self.best_index_]

        if self.refit:
            # We always refit on the full dataset
            indices = np.arange(0, len(y))
            self.refit_and_score(X, y, best_parameters_all,
                                 train=indices, test=indices)

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        # Refit the top performing workflows on the full training dataset
        if self.refit_workflows:
            # Select only from one train-val split, as they are identical
            fitted_workflows = fitted_workflows[:pipelines_per_split]

            # Sort according to best indices
            fitted_workflows = [fitted_workflows[i] for i in bestindices]

            # Remove None workflows
            fitted_workflows = [f for f in fitted_workflows if f is not None]

            self.fitted_workflows = fitted_workflows

        return self

[docs]    def refit_and_score(self, X, y, parameters_all,
                        train, test, verbose=None):
        """Refit the base estimator and attributes such as GroupSel.

        Parameters
        ----------
        X: array, mandatory
                Array containingfor each object (rows) the feature values
                (1st Column) and the associated feature label (2nd Column).

        y: list(?), mandatory
                List containing the labels of the objects.

        parameters_all: dictionary, mandatory
                Contains the settings used for the all preprocessing functions
                and the fitting. TODO: Create a default object and show the
                fields.

        train: list, mandatory
                Indices of the objects to be used as training set.

        test: list, mandatory
                Indices of the objects to be used as testing set.

        """
        if verbose is None:
            verbose = self.verbose

        # Preprocess features if required
        if 'FeatPreProcess' in parameters_all:
            if parameters_all['FeatPreProcess'] == 'True':
                print("Preprocessing features.")
                feature_values = np.asarray([x[0] for x in X])
                feature_labels = np.asarray([x[1] for x in X])
                preprocessor = Preprocessor(verbose=False)
                preprocessor.fit(feature_values, feature_labels=feature_labels[0, :])
                feature_values = preprocessor.transform(feature_values)
                feature_labels = preprocessor.transform(feature_labels)
                X_fit = [(values, labels) for values, labels in zip(feature_values, feature_labels)]
            else:
                X_fit = X
                preprocessor = None
        else:
            X_fit = X
            preprocessor = None

        # Refit all preprocessing functions
        fit_params = _check_fit_params(X_fit, self.fit_params)
        out = fit_and_score(X_fit, y, self.scoring,
                            train, test, parameters_all,
                            fit_params=fit_params,
                            return_train_score=self.return_train_score,
                            return_n_test_samples=True,
                            return_times=True, return_parameters=False,
                            return_estimator=True,
                            error_score=self.error_score,
                            verbose=verbose,
                            return_all=True)

        # Associate best options with new fits
        (save_data, GroupSel, VarSel, SelectModel, feature_labels, scalers,
            encoders, Imputers, PCAs, StatisticalSel, ReliefSel, Sampler) = out
        fitted_estimator = save_data[-2]
        self.best_groupsel = GroupSel
        self.best_scaler = scalers
        self.best_varsel = VarSel
        self.best_modelsel = SelectModel
        self.best_preprocessor = preprocessor
        self.best_imputer = Imputers
        self.best_encoder = encoders
        self.best_pca = PCAs
        self.best_featlab = feature_labels
        self.best_statisticalsel = StatisticalSel
        self.best_reliefsel = ReliefSel
        self.best_Sampler = Sampler
        self.best_estimator_ = fitted_estimator
        self.best_params_ = parameters_all

        return self

[docs]    def create_ensemble(self, X_train, Y_train, verbose=None, initialize=True,
                        scoring=None, method=50, overfit_scaler=False):
        """Create ensemble of multiple workflows.

        Create an (optimal) ensemble of a combination of hyperparameter settings
        and the associated groupsels, PCAs, estimators etc.

        Based on Caruana et al. 2004, but a little different:

        1. Recreate the training/validation splits for a n-fold cross validation.
        2. For each fold:
            a. Start with an empty ensemble
            b. Create starting ensemble by adding N individually best performing
               models on the validation set. N is tuned on the validation set.
            c. Add model that improves ensemble performance on validation set the most, with replacement.
            d. Repeat (c) untill performance does not increase

        The performance metric is the same as for the original hyperparameter
        search, i.e. probably the F1-score for classification and r2-score
        for regression. However, we recommend using the SAR score, as this is
        more universal.

        Method: top50 or Caruana

        """
        # Define a function for scoring the performance of a classifier
        def compute_performance(scoring, Y_valid_truth, Y_valid_score):
            if scoring == 'f1_weighted':
                # Convert score to binaries first
                for num in range(0, len(Y_valid_score)):
                    if Y_valid_score[num] >= 0.5:
                        Y_valid_score[num] = 1
                    else:
                        Y_valid_score[num] = 0

                perf = f1_score(Y_valid_truth, Y_valid_score, average='weighted')
            elif scoring == 'f1':
                # Convert score to binaries first
                for num in range(0, len(Y_valid_score)):
                    if Y_valid_score[num] >= 0.5:
                        Y_valid_score[num] = 1
                    else:
                        Y_valid_score[num] = 0

                perf = f1_score(Y_valid_truth, Y_valid_score, average='macro')
            elif scoring == 'auc':
                perf = roc_auc_score(Y_valid_truth, Y_valid_score)
            elif scoring == 'sar':
                perf = sar_score(Y_valid_truth, Y_valid_score)
            else:
                raise KeyError('[WORC Warning] No valid score method given in ensembling: ' + str(scoring))

            return perf

        if verbose is None:
            verbose = self.verbose

        if scoring is None:
            scoring = self.scoring

        # Get settings for best 100 estimators
        parameters_all = self.cv_results_['params']
        n_classifiers = len(parameters_all)
        n_iter = len(self.cv_iter)

        # Create a new base object for the ensemble components
        if type(self) == RandomizedSearchCVfastr:
            base_estimator = RandomizedSearchCVfastr()
        elif type(self) == RandomizedSearchCVJoblib:
            base_estimator = RandomizedSearchCVJoblib()

        if type(method) is int:
            # Simply take the top50 best hyperparameters
            if verbose:
                print(f'Creating ensemble using top {str(method)} individual classifiers.')
            if method == 1:
                # Next functions expect list
                ensemble = [0]
            else:
                ensemble = range(0, method)

        elif method == 'FitNumber':
            # Use optimum number of models

            # In order to speed up the process, we precompute all scores of the possible
            # classifiers in all cross validation estimatons

            # Create the training and validation set scores
            if verbose:
                print('Precomputing scores on training and validation set.')
            Y_valid_score = list()
            Y_valid_truth = list()
            performances = np.zeros((n_iter, n_classifiers))
            for it, (train, valid) in enumerate(self.cv_iter):
                if verbose:
                    print(f' - iteration {it + 1} / {n_iter}.')
                Y_valid_score_it = np.zeros((n_classifiers, len(valid)))

                # Loop over the 100 best estimators
                for num, p_all in enumerate(parameters_all):
                    # NOTE: Explicitly exclude validation set, elso refit and score
                    # somehow still seems to use it.
                    X_train_temp = [X_train[i] for i in train]
                    Y_train_temp = [Y_train[i] for i in train]
                    train_temp = np.arange(0, len(train))

                    # Refit a SearchCV object with the provided parameters
                    base_estimator.refit_and_score(X_train_temp, Y_train_temp, p_all,
                                                   train_temp, train_temp,
                                                   verbose=False)

                    # Predict and save scores
                    X_train_values = [x[0] for x in X_train] # Throw away labels
                    X_train_values_valid = [X_train_values[i] for i in valid]
                    Y_valid_score_temp = base_estimator.predict_proba(X_train_values_valid)

                    # Only take the probabilities for the second class
                    Y_valid_score_temp = Y_valid_score_temp[:, 1]

                    # Append to array for all classifiers on this validation set
                    Y_valid_score_it[num, :] = Y_valid_score_temp

                    if num == 0:
                        # Also store the validation ground truths
                        Y_valid_truth.append(Y_train[valid])

                    performances[it, num] = compute_performance(scoring,
                                                                Y_train[valid],
                                                                Y_valid_score_temp)

                Y_valid_score.append(Y_valid_score_it)

            # Sorted Ensemble Initialization -------------------------------------
            # Go on adding to the ensemble untill we find the optimal performance
            # Initialize variables

            # Note: doing this in a greedy way doesnt work. We compute the
            # performances for the ensembles of lengt [1, n_classifiers] and
            # select the optimum
            best_performance = 0
            new_performance = 0.001
            iteration = 0
            ensemble = list()
            y_score = [None]*n_iter
            best_index = 0
            single_estimator_performance = new_performance

            if initialize:
                # Rank the models based on scoring on the validation set
                performances = np.mean(performances, axis=0)
                sortedindices = np.argsort(performances)[::-1]
                performances_n_class = list()

                if verbose:
                    print("\n")
                    print('Sorted Ensemble Initialization.')
                # while new_performance > best_performance:
                for dummy in range(0, n_classifiers):
                    # Score is better, so expand ensemble and replace new best score
                    best_performance = new_performance

                    if iteration > 1:
                        # Stack scores: not needed for first iteration
                        ensemble.append(best_index)
                        # N_models += 1
                        for num in range(0, n_iter):
                            y_score[num] = np.vstack((y_score[num], Y_valid_score[num][ensemble[-1], :]))

                    elif iteration == 1:
                        # Create y_score object for second iteration
                        single_estimator_performance = new_performance
                        ensemble.append(best_index)
                        # N_models += 1
                        for num in range(0, n_iter):
                            y_score[num] = Y_valid_score[num][ensemble[-1], :]

                    # Perform n-fold cross validation to estimate performance of next best classifier
                    performances_temp = np.zeros((n_iter))
                    for n_crossval in range(0, n_iter):
                        # For each estimator, add the score to the ensemble and new ensemble performance
                        if iteration == 0:
                            # No y_score yet, so we need to build it instead of stacking
                            y_valid_score_new = Y_valid_score[n_crossval][sortedindices[iteration], :]
                        else:
                            # Stack scores of added model on top of previous scores and average
                            y_valid_score_new = np.mean(np.vstack((y_score[n_crossval], Y_valid_score[n_crossval][sortedindices[iteration], :])), axis=0)

                        perf = compute_performance(scoring, Y_valid_truth[n_crossval], y_valid_score_new)
                        performances_temp[n_crossval] = perf

                    # Check which ensemble should be in the ensemble to maximally improve
                    new_performance = np.mean(performances_temp)
                    performances_n_class.append(new_performance)
                    best_index = sortedindices[iteration]
                    iteration += 1

                # Select N_models for initialization
                new_performance = max(performances_n_class)
                N_models = performances_n_class.index(new_performance) + 1  # +1 due to python indexing
                ensemble = ensemble[0:N_models]
                best_performance = new_performance

                # Print the performance gain
                print(f"Ensembling best {scoring}: {best_performance}.")
                print(f"Single estimator best {scoring}: {single_estimator_performance}.")
                print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')

        elif method == 'Caruana':
            # Use the method from Caruana
            if verbose:
                print('Creating ensemble with Caruana method.')

            # In order to speed up the process, we precompute all scores of the possible
            # classifiers in all cross validation estimatons

            # Create the training and validation set scores
            if verbose:
                print('Precomputing scores on training and validation set.')
            Y_valid_score = list()
            Y_valid_truth = list()
            performances = np.zeros((n_iter, n_classifiers))
            for it, (train, valid) in enumerate(self.cv_iter):
                if verbose:
                    print(f' - iteration {it + 1} / {n_iter}.')
                Y_valid_score_it = np.zeros((n_classifiers, len(valid)))

                # Loop over the 100 best estimators
                for num, p_all in enumerate(parameters_all):
                    # NOTE: Explicitly exclude validation set, elso refit and score
                    # somehow still seems to use it.
                    X_train_temp = [X_train[i] for i in train]
                    Y_train_temp = [Y_train[i] for i in train]
                    train_temp = np.arange(0, len(train))

                    # Refit a SearchCV object with the provided parameters
                    base_estimator.refit_and_score(X_train_temp, Y_train_temp, p_all,
                                                   train_temp, train_temp,
                                                   verbose=False)

                    # Predict and save scores
                    X_train_values = [x[0] for x in X_train] # Throw away labels
                    X_train_values_valid = [X_train_values[i] for i in valid]
                    Y_valid_score_temp = base_estimator.predict_proba(X_train_values_valid)

                    # Only take the probabilities for the second class
                    Y_valid_score_temp = Y_valid_score_temp[:, 1]

                    # Append to array for all classifiers on this validation set
                    Y_valid_score_it[num, :] = Y_valid_score_temp

                    if num == 0:
                        # Also store the validation ground truths
                        Y_valid_truth.append(Y_train[valid])

                    performances[it, num] = compute_performance(scoring,
                                                                Y_train[valid],
                                                                Y_valid_score_temp)

                Y_valid_score.append(Y_valid_score_it)

            # Sorted Ensemble Initialization -------------------------------------
            # Go on adding to the ensemble untill we find the optimal performance
            # Initialize variables

            # Note: doing this in a greedy way doesnt work. We compute the
            # performances for the ensembles of lengt [1, n_classifiers] and
            # select the optimum
            best_performance = 0
            new_performance = 0.001
            iteration = 0
            ensemble = list()
            y_score = [None]*n_iter
            best_index = 0
            single_estimator_performance = new_performance

            if initialize:
                # Rank the models based on scoring on the validation set
                performances = np.mean(performances, axis=0)
                sortedindices = np.argsort(performances)[::-1]
                performances_n_class = list()

                if verbose:
                    print("\n")
                    print('Sorted Ensemble Initialization.')
                # while new_performance > best_performance:
                for dummy in range(0, n_classifiers):
                    # Score is better, so expand ensemble and replace new best score
                    best_performance = new_performance

                    if iteration > 1:
                        # Stack scores: not needed for first iteration
                        ensemble.append(best_index)
                        # N_models += 1
                        for num in range(0, n_iter):
                            y_score[num] = np.vstack((y_score[num], Y_valid_score[num][ensemble[-1], :]))

                    elif iteration == 1:
                        # Create y_score object for second iteration
                        single_estimator_performance = new_performance
                        ensemble.append(best_index)
                        # N_models += 1
                        for num in range(0, n_iter):
                            y_score[num] = Y_valid_score[num][ensemble[-1], :]

                    # Perform n-fold cross validation to estimate performance of next best classifier
                    performances_temp = np.zeros((n_iter))
                    for n_crossval in range(0, n_iter):
                        # For each estimator, add the score to the ensemble and new ensemble performance
                        if iteration == 0:
                            # No y_score yet, so we need to build it instead of stacking
                            y_valid_score_new = Y_valid_score[n_crossval][sortedindices[iteration], :]
                        else:
                            # Stack scores of added model on top of previous scores and average
                            y_valid_score_new = np.mean(np.vstack((y_score[n_crossval], Y_valid_score[n_crossval][sortedindices[iteration], :])), axis=0)

                        perf = compute_performance(scoring, Y_valid_truth[n_crossval], y_valid_score_new)
                        performances_temp[n_crossval] = perf

                    # Check which ensemble should be in the ensemble to maximally improve
                    new_performance = np.mean(performances_temp)
                    performances_n_class.append(new_performance)
                    best_index = sortedindices[iteration]
                    iteration += 1

                # Select N_models for initialization
                new_performance = max(performances_n_class)
                N_models = performances_n_class.index(new_performance) + 1  # +1 due to python indexing
                ensemble = ensemble[0:N_models]
                best_performance = new_performance

                # Print the performance gain
                print(f"Ensembling best {scoring}: {best_performance}.")
                print(f"Single estimator best {scoring}: {single_estimator_performance}.")
                print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')

            # Greedy selection  -----------------------------------------------
            # Initialize variables
            best_performance -= 1e-10
            iteration = 0

            # Go on adding to the ensemble untill we find the optimal performance
            if verbose:
                print("\n")
                print('Greedy selection.')
            while new_performance > best_performance:
                # Score is better, so expand ensemble and replace new best score
                if verbose:
                    print(f"Iteration: {iteration}, best {scoring}: {new_performance}.")
                best_performance = new_performance

                if iteration > 1:
                    # Stack scores: not needed for first iteration
                    ensemble.append(best_index)
                    for num in range(0, n_iter):
                        y_score[num] = np.vstack((y_score[num], Y_valid_score[num][ensemble[-1], :]))

                elif iteration == 1:
                    if not initialize:
                        # Create y_score object for second iteration
                        single_estimator_performance = new_performance
                        ensemble.append(best_index)
                        for num in range(0, n_iter):
                            y_score[num] = Y_valid_score[num][ensemble[-1], :]
                    else:
                        # Stack scores: not needed when ensemble initialization is already used
                        ensemble.append(best_index)
                        for num in range(0, n_iter):
                            y_score[num] = np.vstack((y_score[num], Y_valid_score[num][ensemble[-1], :]))

                # Perform n-fold cross validation to estimate performance of each possible addition to ensemble
                performances_temp = np.zeros((n_iter, n_classifiers))
                for n_crossval in range(0, n_iter):
                    # For each estimator, add the score to the ensemble and new ensemble performance
                    for n_estimator in range(0, n_classifiers):
                        if iteration == 0:
                            # No y_score yet, so we need to build it instead of stacking
                            y_valid_score_new = Y_valid_score[n_crossval][n_estimator, :]
                        else:
                            # Stack scores of added model on top of previous scores and average
                            y_valid_score_new = np.mean(np.vstack((y_score[n_crossval], Y_valid_score[n_crossval][n_estimator, :])), axis=0)

                        perf = compute_performance(scoring, Y_valid_truth[n_crossval], y_valid_score_new)
                        performances_temp[n_crossval, n_estimator] = perf

                # Average performances over crossval
                performances_temp = list(np.mean(performances_temp, axis=0))

                # Check which ensemble should be in the ensemble to maximally improve
                new_performance = max(performances_temp)
                best_index = performances_temp.index(new_performance)
                iteration += 1

            # Print the performance gain
            print(f"Ensembling best {scoring}: {best_performance}.")
            print(f"Single estimator best {scoring}: {single_estimator_performance}.")
            print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')
        else:
            print(f'[WORC WARNING] No valid ensemble method given: {method}. Not ensembling')
            return self

        # Create the ensemble --------------------------------------------------
        train = np.arange(0, len(X_train))
        if self.fitted_workflows:
            # Simply select the required estimators
            print('\t - Detected already fitted workflows.')
            estimators = list()
            for i in ensemble:
                try:
                    # Try a prediction to see if estimator is truly fitted
                    self.fitted_workflows[i].predict(np.asarray([X_train[0][0], X_train[1][0]]))
                    estimators.append(self.fitted_workflows[i])
                except (NotFittedError, ValueError):
                    print(f'\t\t - Estimator {i} not fitted (correctly) yet, refit.')
                    estimator = self.fitted_workflows[i]
                    estimator.refit_and_score(X_train, Y_train,
                                              parameters_all[i],
                                              train, train,
                                              verbose=False)
                    estimators.append(estimator)
        else:
            # Create the ensemble trained on the full training set
            parameters_all = [parameters_all[i] for i in ensemble]
            estimators = list()
            nest = len(ensemble)
            for enum, p_all in enumerate(parameters_all):
                # Refit a SearchCV object with the provided parameters
                print(f"Refitting estimator {enum+1} / {nest}.")
                base_estimator = clone(base_estimator)

                # # Check if we need to create a multiclass estimator
                base_estimator.refit_and_score(X_train, Y_train, p_all,
                                               train, train,
                                               verbose=False)

                # Determine whether to overfit the feature scaling on the test set
                base_estimator.overfit_scaler = overfit_scaler

                estimators.append(base_estimator)

        self.ensemble = Ensemble(estimators)
        self.best_estimator_ = self.ensemble
        print("\n")


[docs]class BaseSearchCVfastr(BaseSearchCV):
    """Base class for hyper parameter search with cross-validation."""

    def _fit(self, X, y, groups, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        regressors = ['SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet']
        isclassifier =\
            not any(clf in regressors for clf in self.param_distributions['classifiers'])

        # Check the cross-validation object and do the splitting
        cv = check_cv(self.cv, y, classifier=isclassifier)

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print(f"Fitting {n_splits} folds for each of {n_candidates} candidates, totalling {n_candidates * n_splits} fits.")

        cv_iter = list(cv.split(X, y, groups))

        # NOTE: We do not check the scoring here, as this can differ
        # per estimator. Thus, this is done inside the fit and scoring

        # Check fitting parameters
        fit_params = _check_fit_params(X, self.fit_params)

        # Create temporary directory for fastr
        if DebugDetector().do_detection():
            # Specific name for easy debugging
            debugnum = 0
            name = 'DEBUG_' + str(debugnum)
            tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name)
            while os.path.exists(tempfolder):
                debugnum += 1
                name = 'DEBUG_' + str(debugnum)
                tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name)

        else:
            name = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))

        tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name)
        if not os.path.exists(tempfolder):
            os.makedirs(tempfolder)

        # Draw parameter sample
        for num, parameters in enumerate(parameter_iterable):
            parameter_sample = parameters
            break

        # Preprocess features if required
        if 'FeatPreProcess' in parameter_sample:
            if parameter_sample['FeatPreProcess'] == 'True':
                print("Preprocessing features.")
                feature_values = np.asarray([x[0] for x in X])
                feature_labels = np.asarray([x[1] for x in X])
                preprocessor = Preprocessor(verbose=False)
                preprocessor.fit(feature_values, feature_labels=feature_labels[0, :])
                feature_values = preprocessor.transform(feature_values)
                feature_labels = preprocessor.transform(feature_labels)
                X = [(values, labels) for values, labels in zip(feature_values, feature_labels)]

        # Create the parameter files
        parameters_temp = dict()
        try:
            for num, parameters in enumerate(parameter_iterable):
                parameters["Number"] = str(num)
                parameters_temp[str(num)] = parameters

        except ValueError:
            # One of the parameters gives an error. Find out which one.
            param_grid = dict()
            for k, v in parameter_iterable.param_distributions.iteritems():
                param_grid[k] = v
                sampled_params = ParameterSampler(param_grid, 5)
                try:
                    for num, parameters in enumerate(sampled_params):
                        # Dummy operation
                        a = 1
                except ValueError:
                    break

            message = 'One or more of the values in your parameter sampler ' +\
                      'is either not iterable, or the distribution cannot ' +\
                      'generate valid samples. Please check your  ' +\
                      f' parameters. At least {k} gives an error.'
            raise WORCexceptions.WORCValueError(message)

        # Split the parameters files in equal parts
        keys = list(parameters_temp.keys())
        keys = chunks(keys, self.n_jobspercore)
        parameter_files = dict()
        for num, k in enumerate(keys):
            temp_dict = dict()
            for number in k:
                temp_dict[number] = parameters_temp[number]

            fname = f'settings_{num}.json'
            sourcename = os.path.join(tempfolder, 'parameters', fname)
            if not os.path.exists(os.path.dirname(sourcename)):
                os.makedirs(os.path.dirname(sourcename))
            with open(sourcename, 'w') as fp:
                json.dump(temp_dict, fp, indent=4)

            parameter_files[str(num).zfill(4)] =\
                f'vfs://tmp/GS/{name}/parameters/{fname}'

        # Create test-train splits
        traintest_files = dict()
        # TODO: ugly nummering solution
        num = 0
        for train, test in cv_iter:
            source_labels = ['train', 'test']

            source_data = pd.Series([train, test],
                                    index=source_labels,
                                    name='Train-test data')

            fname = f'traintest_{num}.hdf5'
            sourcename = os.path.join(tempfolder, 'traintest', fname)
            if not os.path.exists(os.path.dirname(sourcename)):
                os.makedirs(os.path.dirname(sourcename))
            traintest_files[str(num).zfill(4)] = f'vfs://tmp/GS/{name}/traintest/{fname}'

            sourcelabel = f"Source Data Iteration {num}"
            source_data.to_hdf(sourcename, sourcelabel)

            num += 1

        # Create the files containing the estimator and settings
        estimator_labels = ['X', 'y', 'scoring',
                            'verbose', 'fit_params', 'return_train_score',
                            'return_n_test_samples',
                            'return_times', 'return_parameters',
                            'return_estimator',
                            'error_score', 'return_all', 'refit_workflows']

        verbose = False
        return_n_test_samples = True
        return_times = True
        return_parameters = False
        return_estimator = False
        return_all = False
        estimator_data = pd.Series([X, y, self.scoring,
                                    verbose, fit_params,
                                    self.return_train_score,
                                    return_n_test_samples, return_times,
                                    return_parameters,
                                    return_estimator,
                                    self.error_score,
                                    return_all, self.refit_workflows],
                                   index=estimator_labels,
                                   name='estimator Data')
        fname = 'estimatordata.hdf5'
        estimatorname = os.path.join(tempfolder, fname)
        estimator_data.to_hdf(estimatorname, 'Estimator Data')

        estimatordata = f"vfs://tmp/GS/{name}/{fname}"

        # Create the fastr network
        network = fastr.create_network('WORC_GridSearch_' + name)
        estimator_data = network.create_source('HDF5', id='estimator_source')
        traintest_data = network.create_source('HDF5', id='traintest')
        parameter_data = network.create_source('JsonFile', id='parameters')
        sink_output = network.create_sink('HDF5', id='output')

        fitandscore =\
            network.create_node('worc/fitandscore:1.0',
                                tool_version='1.0',
                                id='fitandscore',
                                resources=ResourceLimit(memory=self.memory))

        fitandscore.inputs['estimatordata'].input_group = 'estimator'
        fitandscore.inputs['traintest'].input_group = 'traintest'
        fitandscore.inputs['parameters'].input_group = 'parameters'

        fitandscore.inputs['estimatordata'] = estimator_data.output
        fitandscore.inputs['traintest'] = traintest_data.output
        fitandscore.inputs['parameters'] = parameter_data.output
        sink_output.input = fitandscore.outputs['fittedestimator']

        source_data = {'estimator_source': estimatordata,
                       'traintest': traintest_files,
                       'parameters': parameter_files}
        sink_data = {'output': f"vfs://tmp/GS/{name}/output_{{sample_id}}_{{cardinality}}{{ext}}"}

        network.execute(source_data, sink_data,
                        tmpdir=os.path.join(tempfolder, 'tmp'),
                        execution_plugin=self.fastr_plugin)

        # Check whether all jobs have finished
        expected_no_files = len(list(traintest_files.keys())) * len(list(parameter_files.keys()))
        sink_files = glob.glob(os.path.join(fastr.config.mounts['tmp'], 'GS', name) + '/output*.hdf5')
        sink_files.sort()
        if len(sink_files) != expected_no_files:
            difference = expected_no_files - len(sink_files)
            fname = os.path.join(tempfolder, 'tmp')
            message = ('Fitting classifiers has failed for ' +
                       f'{difference} / {expected_no_files} files. The temporary ' +
                       f'results where not deleted and can be found in {tempfolder}. ' +
                       'Probably your fitting and scoring failed: check out ' +
                       'the tmp/fitandscore folder within the tempfolder for ' +
                       'the fastr job temporary results or run: fastr trace ' +
                       f'"{fname}{os.path.sep}__sink_data__.json" --samples.')
            raise WORCexceptions.WORCValueError(message)

        # Read in the output data once finished
        save_data = list()
        for output in sink_files:
            data = pd.read_hdf(output)
            save_data.extend(list(data['RET']))

        # if one choose to see train score, "out" will contain train score info
        if self.return_train_score:
            if self.refit_workflows:
                (train_scores, test_scores, test_sample_counts,
                 fit_time, score_time, parameters_all, fitted_workflows) =\
                  zip(*save_data)
            else:
                fitted_workflows = None
                (train_scores, test_scores, test_sample_counts,
                 fit_time, score_time, parameters_all) =\
                    zip(*save_data)
        else:
            if self.refit_workflows:
                (test_scores, test_sample_counts,
                 fit_time, score_time, parameters_all, fitted_workflows) =\
                  zip(*save_data)
            else:
                fitted_workflows = None
                (test_scores, test_sample_counts,
                 fit_time, score_time, parameters_all) =\
                    zip(*save_data)

        # Remove the temporary folder used
        if name != 'DEBUG_0':
            # Do delete if not debugging for first iteration
            shutil.rmtree(tempfolder)

        # Process the results of the fitting procedure
        self.process_fit(n_splits=n_splits,
                         parameters_all=parameters_all,
                         test_sample_counts=test_sample_counts,
                         test_score_dicts=test_scores,
                         train_score_dicts=train_scores,
                         fit_time=fit_time,
                         score_time=score_time,
                         cv_iter=cv_iter,
                         X=X, y=y,
                         fitted_workflows=fitted_workflows)


[docs]class RandomizedSearchCVfastr(BaseSearchCVfastr):
    """Randomized search on hyper parameters.

    RandomizedSearchCV implements a "fit" and a "score" method.
    It also implements "predict", "predict_proba", "decision_function",
    "transform" and "inverse_transform" if they are implemented in the
    estimator used.

    The parameters of the estimator used to apply these methods are optimized
    by cross-validated search over parameter settings.

    In contrast to GridSearchCV, not all parameter values are tried out, but
    rather a fixed number of parameter settings is sampled from the specified
    distributions. The number of parameter settings that are tried is
    given by n_iter.

    If all parameters are presented as a list,
    sampling without replacement is performed. If at least one parameter
    is given as a distribution, sampling with replacement is used.
    It is highly recommended to use continuous distributions for continuous
    parameters.

    Read more in the sklearn user guide.

    Parameters
    ----------
    estimator : estimator object.
        A object of that type is instantiated for each grid point.
        This is assumed to implement the scikit-learn estimator interface.
        Either estimator needs to provide a ``score`` function,
        or ``scoring`` must be passed.

    param_distributions : dict
        Dictionary with parameters names (string) as keys and distributions
        or lists of parameters to try. Distributions must provide a ``rvs``
        method for sampling (such as those from scipy.stats.distributions).
        If a list is given, it is sampled uniformly.

    n_iter : int, default=10
        Number of parameter settings that are sampled. n_iter trades
        off runtime vs quality of the solution.

    scoring : string, callable or None, default=None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
        If ``None``, the ``score`` method of the estimator is used.

    fit_params : dict, optional
        Parameters to pass to the fit method.

    n_jobs : int, default=1
        Number of jobs to run in parallel.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    iid : boolean, default=True
        If True, the data is assumed to be identically distributed across
        the folds, and the loss minimized is the total loss per sample,
        and not the mean loss across the folds.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer the sklearn user guide for the various
        cross-validation strategies that can be used here.

    refit : boolean, default=True
        Refit the best estimator with the entire dataset.
        If "False", it is impossible to make predictions using
        this RandomizedSearchCV instance after fitting.

    verbose : integer
        Controls the verbosity: the higher, the more messages.

    random_state : int or RandomState
        Pseudo random number generator state used for random uniform sampling
        from lists of possible values instead of scipy.stats distributions.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    return_train_score : boolean, default=True
        If ``'False'``, the ``cv_results_`` attribute will not include training
        scores.

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

        For instance the below given table

        +--------------+-------------+-------------------+---+---------------+
        | param_kernel | param_gamma | split0_test_score |...|rank_test_score|
        +==============+=============+===================+===+===============+
        |    'rbf'     |     0.1     |        0.8        |...|       2       |
        +--------------+-------------+-------------------+---+---------------+
        |    'rbf'     |     0.2     |        0.9        |...|       1       |
        +--------------+-------------+-------------------+---+---------------+
        |    'rbf'     |     0.3     |        0.7        |...|       1       |
        +--------------+-------------+-------------------+---+---------------+

        will be represented by a ``cv_results_`` dict of::

            {
            'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'],
                                          mask = False),
            'param_gamma'  : masked_array(data = [0.1 0.2 0.3], mask = False),
            'split0_test_score'  : [0.8, 0.9, 0.7],
            'split1_test_score'  : [0.82, 0.5, 0.7],
            'mean_test_score'    : [0.81, 0.7, 0.7],
            'std_test_score'     : [0.02, 0.2, 0.],
            'rank_test_score'    : [3, 1, 1],
            'split0_train_score' : [0.8, 0.9, 0.7],
            'split1_train_score' : [0.82, 0.5, 0.7],
            'mean_train_score'   : [0.81, 0.7, 0.7],
            'std_train_score'    : [0.03, 0.03, 0.04],
            'mean_fit_time'      : [0.73, 0.63, 0.43, 0.49],
            'std_fit_time'       : [0.01, 0.02, 0.01, 0.01],
            'mean_score_time'    : [0.007, 0.06, 0.04, 0.04],
            'std_score_time'     : [0.001, 0.002, 0.003, 0.005],
            'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],
            }

        NOTE that the key ``'params'`` is used to store a list of parameter
        settings dict for all the parameter candidates.

        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
        ``std_score_time`` are all in seconds.

    best_estimator_ : estimator
        Estimator that was chosen by the search, i.e. estimator
        which gave highest score (or smallest loss if specified)
        on the left out data. Not available if refit=False.

    best_score_ : float
        Score of best_estimator on the left out data.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.

    best_index_ : int
        The index (of the ``cv_results_`` arrays) which corresponds to the best
        candidate parameter setting.

        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
        the parameter setting for the best model, that gives the highest
        mean score (``search.best_score_``).

    scorer_ : function
        Scorer function used on the held out data to choose the best
        parameters for the model.

    n_splits_ : int
        The number of cross-validation splits (folds/iterations).

    Notes
    -----
    The parameters selected are those that maximize the score of the held-out
    data, according to the scoring parameter.

    If `n_jobs` was set to a value higher than one, the data is copied for each
    parameter setting(and not `n_jobs` times). This is done for efficiency
    reasons if individual jobs take very little time, but may raise errors if
    the dataset is large and not enough memory is available.  A workaround in
    this case is to set `pre_dispatch`. Then, the memory is copied only
    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
    n_jobs`.

    See Also
    --------
    :class:`GridSearchCV`:
        Does exhaustive search over a grid of parameters.

    :class:`ParameterSampler`:
        A generator over parameter settings, constructed from
        param_distributions.

    """

[docs]    def __init__(self, param_distributions={}, n_iter=10, scoring=None,
                 fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
                 verbose=0, pre_dispatch='2*n_jobs', random_state=None,
                 error_score='raise', return_train_score=True,
                 n_jobspercore=100, fastr_plugin=None, memory='2G', maxlen=100,
                 ranking_score='test_score', refit_workflows=False):
        super(RandomizedSearchCVfastr, self).__init__(
             param_distributions=param_distributions, scoring=scoring, fit_params=fit_params,
             n_iter=n_iter, random_state=random_state, n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score,
             return_train_score=return_train_score,
             n_jobspercore=n_jobspercore, fastr_plugin=fastr_plugin,
             memory=memory, maxlen=maxlen, ranking_score=ranking_score,
             refit_workflows=refit_workflows)

[docs]    def fit(self, X, y=None, groups=None):
        """Randomized model selection and hyperparameter search.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        """
        print("Fit: " + str(self.n_iter))
        sampled_params = ParameterSampler(self.param_distributions,
                                          self.n_iter,
                                          random_state=self.random_state)
        return self._fit(X, y, groups, sampled_params)


[docs]class BaseSearchCVJoblib(BaseSearchCV):
    """Base class for hyper parameter search with cross-validation."""

    def _fit(self, X, y, groups, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        regressors = ['SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet']
        isclassifier =\
            not any(clf in regressors for clf in self.param_distributions['classifiers'])

        # Check the cross-validation object and do the splitting
        cv = check_cv(self.cv, y, classifier=isclassifier)

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print(f"Fitting {n_splits} folds for each of {n_candidates}" +\
                  " candidates, totalling" +\
                  " {n_candidates * n_splits} fits")

        pre_dispatch = self.pre_dispatch
        cv_iter = list(cv.split(X, y, groups))

        # Check fitting parameters
        fit_params = _check_fit_params(X, self.fit_params)

        # Draw parameter sample
        for num, parameters in enumerate(parameter_iterable):
            parameter_sample = parameters
            break

        # Preprocess features if required
        if 'FeatPreProcess' in parameter_sample:
            if parameter_sample['FeatPreProcess'] == 'True':
                print("Preprocessing features.")
                feature_values = np.asarray([x[0] for x in X])
                feature_labels = np.asarray([x[1] for x in X])
                preprocessor = Preprocessor(verbose=False)
                preprocessor.fit(feature_values, feature_labels=feature_labels[0, :])
                feature_values = preprocessor.transform(feature_values)
                feature_labels = preprocessor.transform(feature_labels)
                X = [(values, labels) for values, labels in zip(feature_values, feature_labels)]

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(delayed(fit_and_score)(X, y, self.scoring,
                                 train, test, parameters,
                                 fit_params=fit_params,
                                 return_train_score=self.return_train_score,
                                 return_n_test_samples=True,
                                 return_times=True, return_parameters=False,
                                 return_estimator=False,
                                 error_score=self.error_score,
                                 verbose=False,
                                 return_all=False)
          for parameters in parameter_iterable
          for train, test in cv_iter)
        save_data = zip(*out)

        # if one choose to see train score, "out" will contain train score info
        if self.return_train_score:
            (train_scores, test_scores, test_sample_counts,
             fit_time, score_time, parameters_all) =\
              save_data
        else:
            (test_scores, test_sample_counts,
             fit_time, score_time, parameters_all) =\
              save_data

        self.process_fit(n_splits=n_splits,
                         parameters_all=parameters_all,
                         test_sample_counts=test_sample_counts,
                         test_score_dicts=test_scores,
                         train_score_dicts=train_scores,
                         fit_time=fit_time,
                         score_time=score_time,
                         cv_iter=cv_iter,
                         X=X, y=y)

        return self


[docs]class GridSearchCVfastr(BaseSearchCVfastr):
    """Exhaustive search over specified parameter values for an estimator.

    Important members are fit, predict.

    GridSearchCV implements a "fit" and a "score" method.
    It also implements "predict", "predict_proba", "decision_function",
    "transform" and "inverse_transform" if they are implemented in the
    estimator used.

    The parameters of the estimator used to apply these methods are optimized
    by cross-validated grid-search over a parameter grid.

    Read more in the sklearn user guide.

    Parameters
    ----------
    estimator : estimator object.
        This is assumed to implement the scikit-learn estimator interface.
        Either estimator needs to provide a ``score`` function,
        or ``scoring`` must be passed.

    param_grid : dict or list of dictionaries
        Dictionary with parameters names (string) as keys and lists of
        parameter settings to try as values, or a list of such
        dictionaries, in which case the grids spanned by each dictionary
        in the list are explored. This enables searching over any sequence
        of parameter settings.

    scoring : string, callable or None, default=None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
        If ``None``, the ``score`` method of the estimator is used.

    fit_params : dict, optional
        Parameters to pass to the fit method.

    n_jobs : int, default=1
        Number of jobs to run in parallel.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    iid : boolean, default=True
        If True, the data is assumed to be identically distributed across
        the folds, and the loss minimized is the total loss per sample,
        and not the mean loss across the folds.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer the sklearn user guide for the various
        cross-validation strategies that can be used here.

    refit : boolean, default=True
        Refit the best estimator with the entire dataset.
        If "False", it is impossible to make predictions using
        this GridSearchCV instance after fitting.

    verbose : integer
        Controls the verbosity: the higher, the more messages.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    return_train_score : boolean, default=True
        If ``'False'``, the ``cv_results_`` attribute will not include training
        scores.


    Examples
    --------
    >>> from sklearn import svm, datasets
    >>> from sklearn.model_selection import GridSearchCV
    >>> iris = datasets.load_iris()
    >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
    >>> svr = svm.SVC()
    >>> clf = GridSearchCV(svr, parameters)
    >>> clf.fit(iris.data, iris.target)
    ...                             # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    GridSearchCV(cv=None, error_score=...,
           estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=...,
                         decision_function_shape=None, degree=..., gamma=...,
                         kernel='rbf', max_iter=-1, probability=False,
                         random_state=None, shrinking=True, tol=...,
                         verbose=False),
           fit_params={}, iid=..., n_jobs=1,
           param_grid=..., pre_dispatch=..., refit=..., return_train_score=...,
           scoring=..., verbose=...)
    >>> sorted(clf.cv_results_.keys())
    ...                             # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    ['mean_fit_time', 'mean_score_time', 'mean_test_score',...
     'mean_train_score', 'param_C', 'param_kernel', 'params',...
     'rank_test_score', 'split0_test_score',...
     'split0_train_score', 'split1_test_score', 'split1_train_score',...
     'split2_test_score', 'split2_train_score',...
     'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score'...]

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

        For instance the below given table

        +------------+-----------+------------+-----------------+---+---------+
        |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_....|
        +============+===========+============+=================+===+=========+
        |  'poly'    |     --    |      2     |        0.8      |...|    2    |
        +------------+-----------+------------+-----------------+---+---------+
        |  'poly'    |     --    |      3     |        0.7      |...|    4    |
        +------------+-----------+------------+-----------------+---+---------+
        |  'rbf'     |     0.1   |     --     |        0.8      |...|    3    |
        +------------+-----------+------------+-----------------+---+---------+
        |  'rbf'     |     0.2   |     --     |        0.9      |...|    1    |
        +------------+-----------+------------+-----------------+---+---------+

        will be represented by a ``cv_results_`` dict of::

            {
            'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
                                         mask = [False False False False]...)
            'param_gamma': masked_array(data = [-- -- 0.1 0.2],
                                        mask = [ True  True False False]...),
            'param_degree': masked_array(data = [2.0 3.0 -- --],
                                         mask = [False False  True  True]...),
            'split0_test_score'  : [0.8, 0.7, 0.8, 0.9],
            'split1_test_score'  : [0.82, 0.5, 0.7, 0.78],
            'mean_test_score'    : [0.81, 0.60, 0.75, 0.82],
            'std_test_score'     : [0.02, 0.01, 0.03, 0.03],
            'rank_test_score'    : [2, 4, 3, 1],
            'split0_train_score' : [0.8, 0.9, 0.7],
            'split1_train_score' : [0.82, 0.5, 0.7],
            'mean_train_score'   : [0.81, 0.7, 0.7],
            'std_train_score'    : [0.03, 0.03, 0.04],
            'mean_fit_time'      : [0.73, 0.63, 0.43, 0.49],
            'std_fit_time'       : [0.01, 0.02, 0.01, 0.01],
            'mean_score_time'    : [0.007, 0.06, 0.04, 0.04],
            'std_score_time'     : [0.001, 0.002, 0.003, 0.005],
            'params'             : [{'kernel': 'poly', 'degree': 2}, ...],
            }

        NOTE that the key ``'params'`` is used to store a list of parameter
        settings dict for all the parameter candidates.

        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
        ``std_score_time`` are all in seconds.

    best_estimator_ : estimator
        Estimator that was chosen by the search, i.e. estimator
        which gave highest score (or smallest loss if specified)
        on the left out data. Not available if refit=False.

    best_score_ : float
        Score of best_estimator on the left out data.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.

    best_index_ : int
        The index (of the ``cv_results_`` arrays) which corresponds to the best
        candidate parameter setting.

        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
        the parameter setting for the best model, that gives the highest
        mean score (``search.best_score_``).

    scorer_ : function
        Scorer function used on the held out data to choose the best
        parameters for the model.

    n_splits_ : int
        The number of cross-validation splits (folds/iterations).

    Notes
    ------
    The parameters selected are those that maximize the score of the left out
    data, unless an explicit score is passed in which case it is used instead.

    If `n_jobs` was set to a value higher than one, the data is copied for each
    point in the grid (and not `n_jobs` times). This is done for efficiency
    reasons if individual jobs take very little time, but may raise errors if
    the dataset is large and not enough memory is available.  A workaround in
    this case is to set `pre_dispatch`. Then, the memory is copied only
    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
    n_jobs`.

    See Also
    ---------
    :class:`ParameterGrid`:
        generates all the combinations of a hyperparameter grid.

    :func:`sklearn.model_selection.train_test_split`:
        utility function to split the data into a development set usable
        for fitting a GridSearchCV instance and an evaluation set for
        its final evaluation.

    :func:`sklearn.metrics.make_scorer`:
        Make a scorer from a performance metric or loss function.

    """

[docs]    def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
                 n_jobs=1, iid=True, refit=True, cv=None, verbose=0,
                 pre_dispatch='2*n_jobs', error_score='raise',
                 return_train_score=True):
        super(GridSearchCVfastr, self).__init__(
            scoring=scoring, fit_params=fit_params,
            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, error_score=error_score,
            return_train_score=return_train_score, fastr_plugin=None,
            memory='2G')
        self.param_grid = param_grid
        _check_param_grid(param_grid)

[docs]    def fit(self, X, y=None, groups=None):
        """Run fit with all sets of parameters.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        """
        return self._fit(X, y, groups, ParameterGrid(self.param_grid))


[docs]class RandomizedSearchCVJoblib(BaseSearchCVJoblib):
    """Randomized search on hyper parameters.

    RandomizedSearchCV implements a "fit" and a "score" method.
    It also implements "predict", "predict_proba", "decision_function",
    "transform" and "inverse_transform" if they are implemented in the
    estimator used.

    The parameters of the estimator used to apply these methods are optimized
    by cross-validated search over parameter settings.

    In contrast to GridSearchCV, not all parameter values are tried out, but
    rather a fixed number of parameter settings is sampled from the specified
    distributions. The number of parameter settings that are tried is
    given by n_iter.

    If all parameters are presented as a list,
    sampling without replacement is performed. If at least one parameter
    is given as a distribution, sampling with replacement is used.
    It is highly recommended to use continuous distributions for continuous
    parameters.

    Read more in the sklearn user guide.

    Parameters
    ----------
    estimator : estimator object.
        A object of that type is instantiated for each grid point.
        This is assumed to implement the scikit-learn estimator interface.
        Either estimator needs to provide a ``score`` function,
        or ``scoring`` must be passed.

    param_distributions : dict
        Dictionary with parameters names (string) as keys and distributions
        or lists of parameters to try. Distributions must provide a ``rvs``
        method for sampling (such as those from scipy.stats.distributions).
        If a list is given, it is sampled uniformly.

    n_iter : int, default=10
        Number of parameter settings that are sampled. n_iter trades
        off runtime vs quality of the solution.

    scoring : string, callable or None, default=None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
        If ``None``, the ``score`` method of the estimator is used.

    fit_params : dict, optional
        Parameters to pass to the fit method.

    n_jobs : int, default=1
        Number of jobs to run in parallel.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    iid : boolean, default=True
        If True, the data is assumed to be identically distributed across
        the folds, and the loss minimized is the total loss per sample,
        and not the mean loss across the folds.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer sklearn user guide for the various
        cross-validation strategies that can be used here.

    refit : boolean, default=True
        Refit the best estimator with the entire dataset.
        If "False", it is impossible to make predictions using
        this RandomizedSearchCV instance after fitting.

    verbose : integer
        Controls the verbosity: the higher, the more messages.

    random_state : int or RandomState
        Pseudo random number generator state used for random uniform sampling
        from lists of possible values instead of scipy.stats distributions.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    return_train_score : boolean, default=True
        If ``'False'``, the ``cv_results_`` attribute will not include training
        scores.

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

        For instance the below given table

        +--------------+-------------+-------------------+---+---------------+
        | param_kernel | param_gamma | split0_test_score |...|rank_test_score|
        +==============+=============+===================+===+===============+
        |    'rbf'     |     0.1     |        0.8        |...|       2       |
        +--------------+-------------+-------------------+---+---------------+
        |    'rbf'     |     0.2     |        0.9        |...|       1       |
        +--------------+-------------+-------------------+---+---------------+
        |    'rbf'     |     0.3     |        0.7        |...|       1       |
        +--------------+-------------+-------------------+---+---------------+

        will be represented by a ``cv_results_`` dict of::

            {
            'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'],
                                          mask = False),
            'param_gamma'  : masked_array(data = [0.1 0.2 0.3], mask = False),
            'split0_test_score'  : [0.8, 0.9, 0.7],
            'split1_test_score'  : [0.82, 0.5, 0.7],
            'mean_test_score'    : [0.81, 0.7, 0.7],
            'std_test_score'     : [0.02, 0.2, 0.],
            'rank_test_score'    : [3, 1, 1],
            'split0_train_score' : [0.8, 0.9, 0.7],
            'split1_train_score' : [0.82, 0.5, 0.7],
            'mean_train_score'   : [0.81, 0.7, 0.7],
            'std_train_score'    : [0.03, 0.03, 0.04],
            'mean_fit_time'      : [0.73, 0.63, 0.43, 0.49],
            'std_fit_time'       : [0.01, 0.02, 0.01, 0.01],
            'mean_score_time'    : [0.007, 0.06, 0.04, 0.04],
            'std_score_time'     : [0.001, 0.002, 0.003, 0.005],
            'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],
            }

        NOTE that the key ``'params'`` is used to store a list of parameter
        settings dict for all the parameter candidates.

        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
        ``std_score_time`` are all in seconds.

    best_estimator_ : estimator
        Estimator that was chosen by the search, i.e. estimator
        which gave highest score (or smallest loss if specified)
        on the left out data. Not available if refit=False.

    best_score_ : float
        Score of best_estimator on the left out data.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.

    best_index_ : int
        The index (of the ``cv_results_`` arrays) which corresponds to the best
        candidate parameter setting.

        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
        the parameter setting for the best model, that gives the highest
        mean score (``search.best_score_``).

    scorer_ : function
        Scorer function used on the held out data to choose the best
        parameters for the model.

    n_splits_ : int
        The number of cross-validation splits (folds/iterations).

    Notes
    -----
    The parameters selected are those that maximize the score of the held-out
    data, according to the scoring parameter.

    If `n_jobs` was set to a value higher than one, the data is copied for each
    parameter setting(and not `n_jobs` times). This is done for efficiency
    reasons if individual jobs take very little time, but may raise errors if
    the dataset is large and not enough memory is available.  A workaround in
    this case is to set `pre_dispatch`. Then, the memory is copied only
    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
    n_jobs`.

    See Also
    --------
    :class:`GridSearchCV`:
        Does exhaustive search over a grid of parameters.

    :class:`ParameterSampler`:
        A generator over parameter settins, constructed from
        param_distributions.

    """

[docs]    def __init__(self, param_distributions={}, n_iter=10, scoring=None,
                 fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
                 verbose=0, pre_dispatch='2*n_jobs', random_state=None,
                 error_score='raise', return_train_score=True,
                 n_jobspercore=100, maxlen=100, ranking_score='test_score'):
        super(RandomizedSearchCVJoblib, self).__init__(
             param_distributions=param_distributions,
             n_iter=n_iter, scoring=scoring, fit_params=fit_params,
             n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score,
             return_train_score=return_train_score,
             n_jobspercore=n_jobspercore, random_state=random_state,
             maxlen=maxlen, ranking_score=ranking_score)

[docs]    def fit(self, X, y=None, groups=None):
        """Run fit on the estimator with randomly drawn parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        """
        sampled_params = ParameterSampler(self.param_distributions,
                                          self.n_iter,
                                          random_state=self.random_state)
        return self._fit(X, y, groups, sampled_params)


[docs]class GridSearchCVJoblib(BaseSearchCVJoblib):
    """Exhaustive search over specified parameter values for an estimator.

    Important members are fit, predict.

    GridSearchCV implements a "fit" and a "score" method.
    It also implements "predict", "predict_proba", "decision_function",
    "transform" and "inverse_transform" if they are implemented in the
    estimator used.

    The parameters of the estimator used to apply these methods are optimized
    by cross-validated grid-search over a parameter grid.

    Read more in the sklearn user guide.

    Parameters
    ----------
    estimator : estimator object.
        This is assumed to implement the scikit-learn estimator interface.
        Either estimator needs to provide a ``score`` function,
        or ``scoring`` must be passed.

    param_grid : dict or list of dictionaries
        Dictionary with parameters names (string) as keys and lists of
        parameter settings to try as values, or a list of such
        dictionaries, in which case the grids spanned by each dictionary
        in the list are explored. This enables searching over any sequence
        of parameter settings.

    scoring : string, callable or None, default=None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
        If ``None``, the ``score`` method of the estimator is used.

    fit_params : dict, optional
        Parameters to pass to the fit method.

    n_jobs : int, default=1
        Number of jobs to run in parallel.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    iid : boolean, default=True
        If True, the data is assumed to be identically distributed across
        the folds, and the loss minimized is the total loss per sample,
        and not the mean loss across the folds.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer sklearn user guide for the various
        cross-validation strategies that can be used here.

    refit : boolean, default=True
        Refit the best estimator with the entire dataset.
        If "False", it is impossible to make predictions using
        this GridSearchCV instance after fitting.

    verbose : integer
        Controls the verbosity: the higher, the more messages.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    return_train_score : boolean, default=True
        If ``'False'``, the ``cv_results_`` attribute will not include training
        scores.


    Examples
    --------
    >>> from sklearn import svm, datasets
    >>> from sklearn.model_selection import GridSearchCV
    >>> iris = datasets.load_iris()
    >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
    >>> svr = svm.SVC()
    >>> clf = GridSearchCV(svr, parameters)
    >>> clf.fit(iris.data, iris.target)
    ...                             # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    GridSearchCV(cv=None, error_score=...,
           estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=...,
                         decision_function_shape=None, degree=..., gamma=...,
                         kernel='rbf', max_iter=-1, probability=False,
                         random_state=None, shrinking=True, tol=...,
                         verbose=False),
           fit_params={}, iid=..., n_jobs=1,
           param_grid=..., pre_dispatch=..., refit=..., return_train_score=...,
           scoring=..., verbose=...)
    >>> sorted(clf.cv_results_.keys())
    ...                             # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    ['mean_fit_time', 'mean_score_time', 'mean_test_score',...
     'mean_train_score', 'param_C', 'param_kernel', 'params',...
     'rank_test_score', 'split0_test_score',...
     'split0_train_score', 'split1_test_score', 'split1_train_score',...
     'split2_test_score', 'split2_train_score',...
     'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score'...]

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

        For instance the below given table

        +------------+-----------+------------+-----------------+---+---------+
        |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_....|
        +============+===========+============+=================+===+=========+
        |  'poly'    |     --    |      2     |        0.8      |...|    2    |
        +------------+-----------+------------+-----------------+---+---------+
        |  'poly'    |     --    |      3     |        0.7      |...|    4    |
        +------------+-----------+------------+-----------------+---+---------+
        |  'rbf'     |     0.1   |     --     |        0.8      |...|    3    |
        +------------+-----------+------------+-----------------+---+---------+
        |  'rbf'     |     0.2   |     --     |        0.9      |...|    1    |
        +------------+-----------+------------+-----------------+---+---------+

        will be represented by a ``cv_results_`` dict of::

            {
            'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
                                         mask = [False False False False]...)
            'param_gamma': masked_array(data = [-- -- 0.1 0.2],
                                        mask = [ True  True False False]...),
            'param_degree': masked_array(data = [2.0 3.0 -- --],
                                         mask = [False False  True  True]...),
            'split0_test_score'  : [0.8, 0.7, 0.8, 0.9],
            'split1_test_score'  : [0.82, 0.5, 0.7, 0.78],
            'mean_test_score'    : [0.81, 0.60, 0.75, 0.82],
            'std_test_score'     : [0.02, 0.01, 0.03, 0.03],
            'rank_test_score'    : [2, 4, 3, 1],
            'split0_train_score' : [0.8, 0.9, 0.7],
            'split1_train_score' : [0.82, 0.5, 0.7],
            'mean_train_score'   : [0.81, 0.7, 0.7],
            'std_train_score'    : [0.03, 0.03, 0.04],
            'mean_fit_time'      : [0.73, 0.63, 0.43, 0.49],
            'std_fit_time'       : [0.01, 0.02, 0.01, 0.01],
            'mean_score_time'    : [0.007, 0.06, 0.04, 0.04],
            'std_score_time'     : [0.001, 0.002, 0.003, 0.005],
            'params'             : [{'kernel': 'poly', 'degree': 2}, ...],
            }

        NOTE that the key ``'params'`` is used to store a list of parameter
        settings dict for all the parameter candidates.

        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
        ``std_score_time`` are all in seconds.

    best_estimator_ : estimator
        Estimator that was chosen by the search, i.e. estimator
        which gave highest score (or smallest loss if specified)
        on the left out data. Not available if refit=False.

    best_score_ : float
        Score of best_estimator on the left out data.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.

    best_index_ : int
        The index (of the ``cv_results_`` arrays) which corresponds to the best
        candidate parameter setting.

        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
        the parameter setting for the best model, that gives the highest
        mean score (``search.best_score_``).

    scorer_ : function
        Scorer function used on the held out data to choose the best
        parameters for the model.

    n_splits_ : int
        The number of cross-validation splits (folds/iterations).

    Notes
    ------
    The parameters selected are those that maximize the score of the left out
    data, unless an explicit score is passed in which case it is used instead.

    If `n_jobs` was set to a value higher than one, the data is copied for each
    point in the grid (and not `n_jobs` times). This is done for efficiency
    reasons if individual jobs take very little time, but may raise errors if
    the dataset is large and not enough memory is available.  A workaround in
    this case is to set `pre_dispatch`. Then, the memory is copied only
    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
    n_jobs`.

    See Also
    ---------
    :class:`ParameterGrid`:
        generates all the combinations of a hyperparameter grid.

    :func:`sklearn.model_selection.train_test_split`:
        utility function to split the data into a development set usable
        for fitting a GridSearchCV instance and an evaluation set for
        its final evaluation.

    :func:`sklearn.metrics.make_scorer`:
        Make a scorer from a performance metric or loss function.

    """

[docs]    def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
                 n_jobs=1, iid=True, refit=True, cv=None, verbose=0,
                 pre_dispatch='2*n_jobs', error_score='raise',
                 return_train_score=True):
        super(GridSearchCVJoblib, self).__init__(
            scoring=scoring, fit_params=fit_params,
            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, error_score=error_score,
            return_train_score=return_train_score)
        self.param_grid = param_grid
        _check_param_grid(param_grid)

[docs]    def fit(self, X, y=None, groups=None):
        """Run fit with all sets of parameters.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        """
        return self._fit(X, y, groups, ParameterGrid(self.param_grid))