Source code for WORC.classification.AdvancedSampler

#!/usr/bin/env python

# Copyright 2016-2020 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sklearn.utils import check_random_state
import numpy as np
import six
from ghalton import Halton
# from sobol_seq import i4_sobol_generate as Sobol
import scipy
from scipy.stats import uniform
import math


class log_uniform():
    """Distribution whose logarithm (in a given base) is uniform.

    A value is drawn from ``scipy.stats.uniform(loc, scale)`` and used as
    the exponent of ``base``, so samples lie between ``base ** loc`` and
    ``base ** (loc + scale)``.

    Parameters
    ----------
    loc : float
        Lower bound of the uniformly distributed exponent.

    scale : float
        Width of the exponent range.

    base : float
        Base that is raised to the sampled exponent.
    """

    def __init__(self, loc=-1, scale=0, base=10):
        self.loc = loc
        self.scale = scale
        self.base = base
        # Frozen once: later changes to self.loc / self.scale do not affect it.
        self.uniform_dist = uniform(loc=self.loc, scale=self.scale)

    def rvs(self, size=None, random_state=None):
        """Draw samples.

        Parameters
        ----------
        size : int, tuple of ints or None
            Shape of the output; ``None`` yields a single scalar.

        random_state : int, RandomState or None
            Forwarded unchanged to the underlying scipy distribution.

        Returns
        -------
        float or numpy.ndarray
            ``base`` raised to the sampled exponent(s).
        """
        # The frozen scipy distribution already treats size=None as
        # "one scalar", so no separate branch is needed.
        exponents = self.uniform_dist.rvs(size=size, random_state=random_state)
        return np.power(self.base, exponents)


class discrete_uniform():
    """Uniform distribution whose samples are truncated to integers.

    Values are drawn from ``scipy.stats.uniform(loc, scale)`` and converted
    to integers by truncation toward zero.

    Parameters
    ----------
    loc : float
        Lower bound of the underlying continuous uniform distribution.

    scale : float
        Width of the underlying continuous uniform distribution.
    """

    def __init__(self, loc=-1, scale=0):
        self.loc = loc
        self.scale = scale
        # Frozen once: later changes to self.loc / self.scale do not affect it.
        self.uniform_dist = uniform(loc=self.loc, scale=self.scale)

    def rvs(self, size=None, random_state=None):
        """Draw integer samples.

        Parameters
        ----------
        size : int, tuple of ints or None
            Shape of the output; ``None`` yields a single Python ``int``.

        random_state : int, RandomState or None
            Forwarded unchanged to the underlying scipy distribution.

        Returns
        -------
        int or numpy.ndarray
            A Python ``int`` when ``size`` is ``None``, otherwise an integer
            array of the requested shape.
        """
        samples = self.uniform_dist.rvs(size=size, random_state=random_state)
        if size is None:
            return int(samples)

        # int() cannot convert an array holding more than one element, so
        # truncate element-wise instead (astype(int) also truncates toward
        # zero, matching int()).
        return np.asarray(samples).astype(int)


class boolean_uniform():
    '''
    Uniform distribution thresholded at a certain value to output booleans.

    A sample is "true" when the value drawn from
    ``scipy.stats.uniform(loc, scale)`` is strictly below ``threshold``.

    Note: the results are returned as the strings ``'True'`` / ``'False'``
    rather than as boolean objects, because WORC later saves them to JSON.

    Parameters
    ----------
    loc : float
        Lower bound of the underlying uniform distribution.

    scale : float
        Width of the underlying uniform distribution.

    threshold : float
        Samples strictly below this value map to ``'True'``.
    '''

    def __init__(self, loc=0, scale=1, threshold=0.5):
        self.loc = loc
        self.scale = scale
        self.threshold = threshold
        # Frozen once: later changes to self.loc / self.scale do not affect it.
        self.uniform_dist = uniform(loc=self.loc, scale=self.scale)

    def rvs(self, size=None, random_state=None):
        """Draw samples as ``'True'`` / ``'False'`` strings.

        Parameters
        ----------
        size : int, tuple of ints or None
            Shape of the output; ``None`` yields a single string.

        random_state : int, RandomState or None
            Forwarded unchanged to the underlying scipy distribution.

        Returns
        -------
        str or list
            One string when ``size`` is ``None``; otherwise a (nested) list
            of strings with the requested shape.
        """
        samples = self.uniform_dist.rvs(size=size, random_state=random_state)
        if size is None:
            return str(samples < self.threshold)

        # One string per sample. Stringifying the whole list at once would
        # collapse all samples into a single string.
        below = np.asarray(samples) < self.threshold
        return below.astype(str).tolist()


class exp_uniform():
    """Distribution obtained by exponentiating a uniform sample.

    A value is drawn from ``scipy.stats.uniform(loc, scale)`` and used as
    the exponent of ``base`` (Euler's number by default).

    Parameters
    ----------
    loc : float
        Lower bound of the uniformly distributed exponent.

    scale : float
        Width of the exponent range.

    base : float
        Base that is raised to the sampled exponent.
    """

    def __init__(self, loc=-1, scale=0, base=math.e):
        self.loc = loc
        self.scale = scale
        self.base = base

    def rvs(self, size=None, random_state=None):
        """Draw samples.

        Parameters
        ----------
        size : int, tuple of ints or None
            Shape of the output; ``None`` yields a single scalar.

        random_state : int, RandomState or None
            Forwarded unchanged to the underlying scipy distribution.

        Returns
        -------
        float or numpy.ndarray
            ``base`` raised to the sampled exponent(s).
        """
        # The distribution is built on every call, so the current values of
        # self.loc and self.scale are always the ones used.
        uniform_dist = uniform(loc=self.loc, scale=self.scale)

        # scipy already treats size=None as "one scalar".
        exponents = uniform_dist.rvs(size=size, random_state=random_state)
        return np.power(self.base, exponents)


class AdvancedSampler(object):
    """Generator on parameters sampled from given distributions using
    numerical sequences. Based on the sklearn ParameterSampler.

    Instead of drawing pseudo-random numbers, every candidate is derived
    from one point of a numerical sequence (currently only ``'Halton'`` is
    supported). Each parameter is assigned one dimension of the sequence,
    in sorted key order, and the coordinate of the point in that dimension
    is mapped onto the parameter's search space:

    * a distribution is evaluated through its ``ppf`` (inverse CDF);
    * a list is split into equally sized bins, so every entry is equally
      likely to be selected.

    Parameters
    ----------
    param_distributions : dict
        Dictionary where the keys are parameters and values are the search
        spaces. A value that has an ``rvs`` attribute is treated as a
        distribution and must also provide ``ppf``; any other value is
        treated as an indexable list of options.

    n_iter : integer
        Number of parameter settings that are produced.

    random_state : int or RandomState
        Stored on the instance, but not consulted when generating samples.

    method : string
        Name of the numerical sequence to use. Only ``'Halton'`` is
        supported; any other value raises a ``KeyError`` on iteration.

    Returns
    -------
    params : dict of string to any
        **Yields** dictionaries mapping each estimator parameter to
        a sampled value.

    Examples
    --------
    >>> from WORC.classification.AdvancedSampler import AdvancedSampler
    >>> from scipy.stats import uniform
    >>> param_grid = {'a': [1, 2], 'b': uniform(loc=0, scale=1)}
    >>> param_list = list(AdvancedSampler(param_grid, n_iter=4))
    >>> len(param_list)
    4
    """

    def __init__(self, param_distributions, n_iter, random_state=None,
                 method='Halton'):
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.random_state = random_state
        self.method = method

        if method == 'Halton':
            # One sequence dimension per parameter
            self.Halton = Halton(len(self.param_distributions))

    def __iter__(self):
        """Yield ``n_iter`` parameter dictionaries.

        Raises
        ------
        KeyError
            If ``method`` is not a supported sequence name.
        """
        # Generate the sequence
        if self.method == 'Halton':
            sequence = self.Halton.get(self.n_iter)
            # All points are fetched up front, so rewind the generator right
            # away. Doing this only after the loop would skip the reset when
            # the caller stops iterating early, and the next iteration
            # would then continue further along the sequence.
            self.Halton.reset()
        else:
            raise KeyError(f'Methods {self.method} not known!')

        # Always sort the keys of a dictionary, for reproducibility
        items = sorted(self.param_distributions.items())
        for sample in sequence:
            params = dict()
            for ind, (k, v) in enumerate(items):
                point = sample[ind]
                # Check if the parameter space is a distribution or a list
                if hasattr(v, "rvs"):
                    # Parameter space is a distribution: map the point
                    # through the inverse CDF
                    params[k] = v.ppf(point)
                else:
                    # Parameter space is a list: split the unit interval
                    # into len(v) equal bins so each entry is equally
                    # likely. The min() guards a point of exactly 1.0.
                    index = min(int(point * len(v)), len(v) - 1)
                    params[k] = v[index]
            yield params

    def __len__(self):
        """Number of points that will be sampled."""
        return self.n_iter


if __name__ == '__main__':
    # Small demonstration: sample a handful of parameter settings from a
    # mixed search space (lists and scipy distributions) and print them.
    random_seed = np.random.randint(1, 5000)
    random_state = check_random_state(random_seed)

    param_distributions = {'kernel': ['poly', 'RGB'],
                           'C': scipy.stats.uniform(loc=0, scale=1E6),
                           'degree': scipy.stats.uniform(loc=1, scale=6),
                           'coef0': scipy.stats.uniform(loc=0, scale=1),
                           'gamma': scipy.stats.uniform(loc=1E-5, scale=1),
                           'histogram_features': ['True', 'False']}

    n_iter = 6

    # Pass the chosen method explicitly; previously it was assigned but
    # never handed to the sampler.
    method = 'Halton'
    sampled_params = AdvancedSampler(param_distributions,
                                     n_iter,
                                     random_state,
                                     method=method)

    for s in sampled_params:
        print(s)