Source code for WORC.classification.construct_classifier

#!/usr/bin/env python

# Copyright 2016-2022 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sklearn.svm import SVC
from sklearn.svm import SVR as SVMR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.linear_model import SGDClassifier, ElasticNet, SGDRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.linear_model import Ridge
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
import scipy.stats
from WORC.classification.AdvancedSampler import log_uniform, discrete_uniform
import WORC.addexceptions as ae
from xgboost import XGBClassifier, XGBRegressor

try:
    from lightgbm import LGBMClassifier
except ImportError:
    print("[INFO] LightGBM classifier currently not available. Please see https://worc.readthedocs.io/en/latest/static/additionalfunctionality.html.")



def construct_classifier(config):
    """Interface to create classification estimators.

    Different classifiers can be created using this common interface.

    Parameters
    ----------
    config: dict, mandatory
        Contains the required config settings. See the Github Wiki for
        all available fields.

    Returns
    -------
    Constructed classifier.

    """
    # NOTE: Function is not working anymore for regression: need
    # to move param grid creation to the create_param_grid function
    max_iter = config['max_iter']
    if 'SVM' in config['classifiers']:
        # Support Vector Machine
        classifier = construct_SVM(config)
    elif config['classifiers'] == 'SVR':
        # Support Vector Regression
        classifier = construct_SVM(config, True)
    elif config['classifiers'] == 'AdaBoostClassifier':
        # AdaBoost classifier
        learning_rate = config['AdaBoost_learning_rate']
        n_estimators = config['AdaBoost_n_estimators']
        classifier = AdaBoostClassifier(n_estimators=n_estimators,
                                        learning_rate=learning_rate,
                                        random_state=config['random_seed'])
    elif config['classifiers'] == 'AdaBoostRegressor':
        # AdaBoost regressor
        learning_rate = config['AdaBoost_learning_rate']
        n_estimators = config['AdaBoost_n_estimators']
        classifier = AdaBoostRegressor(n_estimators=n_estimators,
                                       learning_rate=learning_rate,
                                       random_state=config['random_seed'])
    elif config['classifiers'] == 'XGBClassifier':
        # XGBoost classifier
        max_depth = config['XGB_max_depth']
        learning_rate = config['XGB_learning_rate']
        gamma = config['XGB_gamma']
        min_child_weight = config['XGB_min_child_weight']
        boosting_rounds = config['XGB_boosting_rounds']
        colsample_bytree = config['XGB_colsample_bytree']
        classifier = XGBClassifier(max_depth=max_depth,
                                   learning_rate=learning_rate,
                                   gamma=gamma,
                                   min_child_weight=min_child_weight,
                                   n_estimators=boosting_rounds,
                                   colsample_bytree=colsample_bytree,
                                   random_state=config['random_seed'],
                                   n_jobs=1)
    elif config['classifiers'] == 'XGBRegressor':
        # XGBoost regressor
        max_depth = config['XGB_max_depth']
        learning_rate = config['XGB_learning_rate']
        gamma = config['XGB_gamma']
        min_child_weight = config['XGB_min_child_weight']
        boosting_rounds = config['XGB_boosting_rounds']
        colsample_bytree = config['XGB_colsample_bytree']
        classifier = XGBRegressor(max_depth=max_depth,
                                  learning_rate=learning_rate,
                                  gamma=gamma,
                                  min_child_weight=min_child_weight,
                                  n_estimators=boosting_rounds,
                                  colsample_bytree=colsample_bytree,
                                  random_state=config['random_seed'],
                                  n_jobs=1)
    elif config['classifiers'] == 'LightGBMClassifier':
        # LightGBM classifier
        num_leaves = config['LightGBM_num_leaves']
        max_depth = config['LightGBM_max_depth']
        min_child_samples = config['LightGBM_min_child_samples']
        reg_alpha = config['LightGBM_reg_alpha']
        reg_lambda = config['LightGBM_reg_lambda']
        min_child_weight = config['LightGBM_min_child_weight']
        classifier = LGBMClassifier(num_leaves=num_leaves,
                                    max_depth=max_depth,
                                    min_child_samples=min_child_samples,
                                    reg_alpha=reg_alpha,
                                    reg_lambda=reg_lambda,
                                    min_child_weight=min_child_weight,
                                    random_state=config['random_seed'])
    elif config['classifiers'] == 'RF':
        # Random forest classifier
        classifier = RandomForestClassifier(verbose=0,
                                            class_weight='balanced',
                                            n_estimators=config['RFn_estimators'],
                                            min_samples_split=config['RFmin_samples_split'],
                                            max_depth=config['RFmax_depth'],
                                            random_state=config['random_seed'])
    elif config['classifiers'] == 'RFR':
        # Random forest regressor
        classifier = RandomForestRegressor(verbose=0,
                                           n_estimators=config['RFn_estimators'],
                                           min_samples_split=config['RFmin_samples_split'],
                                           max_depth=config['RFmax_depth'],
                                           random_state=config['random_seed'])
    elif config['classifiers'] == 'ElasticNet':
        # Elastic Net regression
        classifier = ElasticNet(alpha=config['ElasticNet_alpha'],
                                l1_ratio=config['ElasticNet_l1_ratio'],
                                max_iter=max_iter,
                                random_state=config['random_seed'])
    elif config['classifiers'] == 'Lasso':
        # LASSO regression
        classifier = Lasso(max_iter=max_iter,
                           random_state=config['random_seed'])
    elif config['classifiers'] == 'SGD':
        # Stochastic Gradient Descent classifier
        classifier = SGDClassifier(max_iter=config['max_iter'],
                                   alpha=config['SGD_alpha'],
                                   l1_ratio=config['SGD_l1_ratio'],
                                   loss=config['SGD_loss'],
                                   penalty=config['SGD_penalty'],
                                   random_state=config['random_seed'])
    elif config['classifiers'] == 'SGDR':
        # Stochastic Gradient Descent regressor
        classifier = SGDRegressor(max_iter=config['max_iter'],
                                  alpha=config['SGD_alpha'],
                                  l1_ratio=config['SGD_l1_ratio'],
                                  loss=config['SGD_loss'],
                                  penalty=config['SGD_penalty'],
                                  random_state=config['random_seed'])
    elif config['classifiers'] == 'LR':
        # Logistic Regression
        if config['LRpenalty'] in ('elasticnet', 'l1'):
            # The saga solver is required for elasticnet and l1 penalties
            if config['LR_solver'] != 'saga':
                p = config['LRpenalty']
                print(f"[WORC Warning] {p} penalty requires saga "
                      f"solver, got {config['LR_solver']}. Changing solver.")
                config['LR_solver'] = 'saga'
        classifier = LogisticRegression(max_iter=max_iter,
                                        penalty=config['LRpenalty'],
                                        solver=config['LR_solver'],
                                        l1_ratio=config['LR_l1_ratio'],
                                        C=config['LRC'],
                                        random_state=config['random_seed'])
    elif config['classifiers'] == 'LinR':
        # Linear Regression
        classifier = LinearRegression()
    elif config['classifiers'] == 'Ridge':
        # Ridge regression
        classifier = Ridge(alpha=config['ElasticNet_alpha'],
                           max_iter=max_iter,
                           random_state=config['random_seed'])
    elif config['classifiers'] == 'GaussianNB':
        # Naive Bayes classifier using Gaussian distributions
        classifier = GaussianNB()
    elif config['classifiers'] == 'ComplementNB':
        # Complement Naive Bayes classifier
        classifier = ComplementNB()
    elif config['classifiers'] == 'LDA':
        # Linear Discriminant Analysis
        if config['LDA_solver'] == 'svd':
            # Shrinkage does not work with the svd solver
            shrinkage = None
        else:
            shrinkage = config['LDA_shrinkage']
        classifier = LDA(solver=config['LDA_solver'],
                         shrinkage=shrinkage)
    elif config['classifiers'] == 'QDA':
        # Quadratic Discriminant Analysis
        classifier = QDA(reg_param=config['QDA_reg_param'])
    else:
        message = f"Classifier {config['classifiers']} unknown."
        raise ae.WORCKeyError(message)

    return classifier
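

# A minimal usage sketch: construct a random forest classifier from a
# hand-written config. In WORC itself the config dict is generated by the
# framework; the values below are illustrative assumptions, not WORC
# defaults, and this helper is not part of the WORC API.
def _example_construct_rf():
    example_config = {
        'classifiers': 'RF',    # selects the RandomForestClassifier branch
        'max_iter': 10000,      # read unconditionally at the top of the function
        'RFn_estimators': 100,
        'RFmin_samples_split': 2,
        'RFmax_depth': 5,
        'random_seed': 42,
    }
    return construct_classifier(example_config)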
def construct_SVM(config, regression=False):
    """Construct an SVM classifier or regressor.

    Args:
        config (dict): Dictionary of the required config settings.
        regression (bool): If True, construct an SVR instead of an SVC.

    Returns:
        SVM/SVR estimator.

    """
    max_iter = config['max_iter']
    if not regression:
        clf = SVC(class_weight='balanced',
                  probability=True,
                  max_iter=max_iter,
                  random_state=config['random_seed'])
    else:
        # NOTE: SVR has no random state
        clf = SVMR(max_iter=max_iter)

    clf.kernel = str(config['SVMKernel'])
    clf.C = config['SVMC']

    # Only add the following parameters if they are defined
    if 'SVMdegree' in config:
        clf.degree = config['SVMdegree']
    if 'SVMcoef0' in config:
        clf.coef0 = config['SVMcoef0']
    if 'SVMgamma' in config:
        clf.gamma = config['SVMgamma']

    return clf
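

# A minimal usage sketch with assumed values (not WORC defaults): build an
# RBF-kernel SVC. 'SVMdegree', 'SVMcoef0' and 'SVMgamma' are optional and
# only applied when present in the config; if omitted, sklearn's defaults
# are kept. Illustration only, not part of the WORC API.
def _example_construct_svm():
    example_config = {
        'max_iter': 100000,
        'random_seed': 42,
        'SVMKernel': 'rbf',
        'SVMC': 1.0,
        'SVMgamma': 0.01,   # optional key
    }
    return construct_SVM(example_config)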
def create_param_grid(config):
    """Create a parameter grid for the WORC classifiers."""
    # We only need parameters from the Classification part of the config
    config = config['Classification']

    # Create grid and put in name of classifiers and maximum iterations
    param_grid = dict()
    param_grid['classifiers'] = config['classifiers']
    param_grid['max_iter'] = config['max_iter']

    # SVM parameters
    param_grid['SVMKernel'] = config['SVMKernel']
    param_grid['SVMC'] = log_uniform(loc=config['SVMC'][0],
                                     scale=config['SVMC'][1])
    param_grid['SVMdegree'] = scipy.stats.uniform(loc=config['SVMdegree'][0],
                                                  scale=config['SVMdegree'][1])
    param_grid['SVMcoef0'] = scipy.stats.uniform(loc=config['SVMcoef0'][0],
                                                 scale=config['SVMcoef0'][1])
    param_grid['SVMgamma'] = log_uniform(loc=config['SVMgamma'][0],
                                         scale=config['SVMgamma'][1])

    # RF parameters
    param_grid['RFn_estimators'] =\
        discrete_uniform(loc=config['RFn_estimators'][0],
                         scale=config['RFn_estimators'][1])
    param_grid['RFmin_samples_split'] =\
        discrete_uniform(loc=config['RFmin_samples_split'][0],
                         scale=config['RFmin_samples_split'][1])
    param_grid['RFmax_depth'] =\
        discrete_uniform(loc=config['RFmax_depth'][0],
                         scale=config['RFmax_depth'][1])

    # Logistic Regression parameters
    param_grid['LRpenalty'] = config['LRpenalty']
    param_grid['LR_solver'] = config['LR_solver']
    param_grid['LR_l1_ratio'] =\
        scipy.stats.uniform(loc=config['LR_l1_ratio'][0],
                            scale=config['LR_l1_ratio'][1])
    param_grid['LRC'] = scipy.stats.uniform(loc=config['LRC'][0],
                                            scale=config['LRC'][1])

    # LDA/QDA parameters
    param_grid['LDA_solver'] = config['LDA_solver']
    param_grid['LDA_shrinkage'] = log_uniform(loc=config['LDA_shrinkage'][0],
                                              scale=config['LDA_shrinkage'][1])
    param_grid['QDA_reg_param'] = log_uniform(loc=config['QDA_reg_param'][0],
                                              scale=config['QDA_reg_param'][1])

    # ElasticNet parameters
    param_grid['ElasticNet_alpha'] =\
        log_uniform(loc=config['ElasticNet_alpha'][0],
                    scale=config['ElasticNet_alpha'][1])
    param_grid['ElasticNet_l1_ratio'] =\
        scipy.stats.uniform(loc=config['ElasticNet_l1_ratio'][0],
                            scale=config['ElasticNet_l1_ratio'][1])

    # SGD Regression parameters
    param_grid['SGD_alpha'] =\
        log_uniform(loc=config['SGD_alpha'][0],
                    scale=config['SGD_alpha'][1])
    param_grid['SGD_l1_ratio'] =\
        scipy.stats.uniform(loc=config['SGD_l1_ratio'][0],
                            scale=config['SGD_l1_ratio'][1])
    param_grid['SGD_loss'] = config['SGD_loss']
    param_grid['SGD_penalty'] = config['SGD_penalty']

    # Naive Bayes parameters
    param_grid['CNB_alpha'] =\
        scipy.stats.uniform(loc=config['CNB_alpha'][0],
                            scale=config['CNB_alpha'][1])

    # AdaBoost parameters
    param_grid['AdaBoost_n_estimators'] =\
        discrete_uniform(loc=config['AdaBoost_n_estimators'][0],
                         scale=config['AdaBoost_n_estimators'][1])
    param_grid['AdaBoost_learning_rate'] =\
        log_uniform(loc=config['AdaBoost_learning_rate'][0],
                    scale=config['AdaBoost_learning_rate'][1])

    # XGBoost parameters
    param_grid['XGB_boosting_rounds'] =\
        discrete_uniform(loc=config['XGB_boosting_rounds'][0],
                         scale=config['XGB_boosting_rounds'][1])
    param_grid['XGB_max_depth'] =\
        discrete_uniform(loc=config['XGB_max_depth'][0],
                         scale=config['XGB_max_depth'][1])
    param_grid['XGB_learning_rate'] =\
        log_uniform(loc=config['XGB_learning_rate'][0],
                    scale=config['XGB_learning_rate'][1])
    param_grid['XGB_gamma'] =\
        scipy.stats.uniform(loc=config['XGB_gamma'][0],
                            scale=config['XGB_gamma'][1])
    param_grid['XGB_min_child_weight'] =\
        discrete_uniform(loc=config['XGB_min_child_weight'][0],
                         scale=config['XGB_min_child_weight'][1])
    param_grid['XGB_colsample_bytree'] =\
        scipy.stats.uniform(loc=config['XGB_colsample_bytree'][0],
                            scale=config['XGB_colsample_bytree'][1])

    # LightGBM parameters
    param_grid['LightGBM_num_leaves'] =\
        discrete_uniform(loc=config['LightGBM_num_leaves'][0],
                         scale=config['LightGBM_num_leaves'][1])
    param_grid['LightGBM_max_depth'] =\
        discrete_uniform(loc=config['LightGBM_max_depth'][0],
                         scale=config['LightGBM_max_depth'][1])
    param_grid['LightGBM_min_child_samples'] =\
        discrete_uniform(loc=config['LightGBM_min_child_samples'][0],
                         scale=config['LightGBM_min_child_samples'][1])
    param_grid['LightGBM_reg_alpha'] =\
        scipy.stats.uniform(loc=config['LightGBM_reg_alpha'][0],
                            scale=config['LightGBM_reg_alpha'][1])
    param_grid['LightGBM_reg_lambda'] =\
        scipy.stats.uniform(loc=config['LightGBM_reg_lambda'][0],
                            scale=config['LightGBM_reg_lambda'][1])
    param_grid['LightGBM_min_child_weight'] =\
        log_uniform(loc=config['LightGBM_min_child_weight'][0],
                    scale=config['LightGBM_min_child_weight'][1])

    return param_grid
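

# A minimal sketch of how the resulting grid is typically consumed, assuming
# the WORC samplers (log_uniform, discrete_uniform) mimic the scipy.stats
# .rvs() interface, as scipy.stats.uniform does. Each grid entry is either a
# list of categorical options or a distribution, so a random search can draw
# one candidate value per key. The bounds below are illustrative assumptions,
# not WORC defaults.
def _example_sample_grid():
    import random
    grid = {
        'classifiers': ['RF'],
        'RFn_estimators': discrete_uniform(loc=10, scale=90),
        'RFmax_depth': discrete_uniform(loc=5, scale=5),
        'SVMC': log_uniform(loc=-3, scale=6),
    }
    sample = {}
    for key, value in grid.items():
        if hasattr(value, 'rvs'):
            sample[key] = value.rvs()           # draw from the distribution
        else:
            sample[key] = random.choice(value)  # pick a categorical option
    return sample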