# Source code for mastml.models

"""
Module for constructing models for use in MAST-ML.

SklearnModel:
    Class that wraps scikit-learn models to have MAST-ML type functionality. Providing the model name as a string
    and the keyword arguments for the model parameters will construct the model. Note that this class also supports
    construction of XGBoost models and Keras neural network models via Keras' keras.wrappers.scikit_learn.KerasRegressor
    model.

EnsembleModel:
    Class that constructs a model which is an ensemble of many base models (sometimes called weak learners). This
    class supports construction of ensembles of most scikit-learn regression models as well as ensembles of neural
    networks that are made via Keras' keras.wrappers.scikit_learn.KerasRegressor class.

"""

import pandas as pd
import sklearn.base
import sklearn.utils
from sklearn.ensemble import BaggingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
import inspect
from pprint import pprint
import numpy as np
import re

from sklearn.base import BaseEstimator, TransformerMixin

# Optional dependencies: a failed import only prints a hint, so the rest of the module stays usable.
# `except Exception` (rather than a bare `except`) keeps KeyboardInterrupt/SystemExit propagating.
# NOTE(review): ImportError alone may be too narrow here; the hint below suggests xgboost can also fail
# while locating its native library, which may surface as a different exception type. Confirm before narrowing.
try:
    import xgboost
except Exception:
    print('XGBoost is an optional dependency. If you want to use XGBoost models, please manually install xgboost package with '
          'pip install xgboost. If have error with finding libxgboost.dylib library, do '
          'brew install libomp. If do not have brew on your system, first do'
          ' ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" from the Terminal')
try:
    from sklego.linear_model import LowessRegression
except Exception:
    print('scikit-lego is an optional dependency, enabling use of the LowessRegression model. If you want to use this model, '
          'do "pip install scikit-lego"')

class SklearnModel(BaseEstimator, TransformerMixin):
    """
    Class to wrap any sklearn estimator, and provide some new dataframe functionality

    Args:
        model: (str), string denoting the name of an sklearn estimator object, e.g. KernelRidge. The special names
            'XGBoostRegressor', 'GaussianProcessRegressor' and 'LowessRegression' are handled explicitly; any other
            name is looked up in sklearn.utils.all_estimators().

        kwargs: keyword pairs of values to include for model, e.g. for KernelRidge can specify kernel, alpha, gamma values.
            For GaussianProcessRegressor, 'kernel' is optional and may be either a kernel string such as
            'ConstantKernel*RBF+WhiteKernel' or an already-constructed kernel object.

    Raises:
        KeyError: if model is not one of the special names and is not found in sklearn.utils.all_estimators()

    Methods:
        fit: method that fits the model parameters to the provided training data
            Args:
                X: (pd.DataFrame), dataframe of X features

                y: (pd.Series), series of y target data

            Returns:
                fitted model

        predict: method that evaluates model on new data to give predictions
            Args:
                X: (pd.DataFrame), dataframe of X features

                as_frame: (bool), whether to return data as pandas dataframe (else numpy array)

            Returns:
                series or array of predicted values

        predict_proba: method that returns the wrapped model's predict_proba output
            Args:
                X: (pd.DataFrame), dataframe of X features

            Returns:
                output of the wrapped model's predict_proba, or None if the wrapped model has no such method

        get_params: method to output the parameters of the wrapped model
            Args:
                deep: (bool), forwarded to the wrapped model's get_params, default True

            Returns:
                parameters of the wrapped model

        help: method to output key information on class use, e.g. methods and parameters
            Args:
                None

            Returns:
                None, but outputs help to screen
    """
    def __init__(self, model, **kwargs):
        if model == 'XGBoostRegressor':
            self.model = xgboost.XGBRegressor(**kwargs)
        elif model == 'GaussianProcessRegressor':
            # The kernel is optional: when omitted, None is forwarded and GaussianProcessRegressor picks its default.
            kernel = kwargs.pop('kernel', None)
            # Only strings need parsing; a pre-built kernel object is forwarded untouched.
            if isinstance(kernel, str):
                kernel = _make_gpr_kernel(kernel_string=kernel)
            self.model = GaussianProcessRegressor(kernel=kernel, **kwargs)
        elif model == 'LowessRegression':
            self.model = LowessRegression(**kwargs)
        else:
            self.model = dict(sklearn.utils.all_estimators())[model](**kwargs)

    def fit(self, X, y):
        # Returns whatever the wrapped estimator's fit returns, not this wrapper.
        return self.model.fit(X, y)

    def predict(self, X, as_frame=True):
        predictions = self.model.predict(X)
        if as_frame:
            # squeeze() collapses the single 'y_pred' column to a Series.
            # NOTE(review): a one-row prediction will presumably squeeze further to a scalar - confirm callers cope.
            return pd.DataFrame(predictions, columns=['y_pred']).squeeze()
        return predictions.ravel()

    def predict_proba(self, X):
        if hasattr(self.model, 'predict_proba'):
            return self.model.predict_proba(X)
        # Wrapped model offers no probability estimates.
        return None

    def get_params(self, deep=True):
        # Reports the wrapped model's parameters rather than this wrapper's constructor arguments.
        return self.model.get_params(deep)

    def help(self):
        print('Documentation for', self.model)
        pprint(dict(inspect.getmembers(self.model))['__doc__'])
        print('\n')
        print('Class methods for,', self.model)
        pprint(dict(inspect.getmembers(self.model, predicate=inspect.ismethod)))
        print('\n')
        print('Class attributes for,', self.model)
        pprint(self.model.__dict__)
        return

class EnsembleModel(BaseEstimator, TransformerMixin):
    """
    Class used to construct ensemble models with a particular number and type of weak learner (base model). The
    ensemble model is compatible with most scikit-learn regressor models and KerasRegressor models

    Args:
        model: (str or estimator object), string name denoting the name of the model type to use as the base model,
            or an already-constructed estimator (e.g. a keras.wrappers.scikit_learn.KerasRegressor), which is used
            as the base model as-is

        n_estimators: (int), the number of base models to include in the ensemble

        kwargs: keyword arguments for the base model parameter names and values. Only used when model is a string.
            For GaussianProcessRegressor, 'kernel' is optional and may be a kernel string or a kernel object.

    Raises:
        ValueError: if model is a string that is neither a specially handled name nor a scikit-learn estimator name

    Methods:
        fit: method that fits the model parameters to the provided training data
            Args:
                X: (pd.DataFrame), dataframe of X features

                y: (pd.Series), series of y target data

            Returns:
                fitted model

        predict: method that evaluates model on new data to give predictions
            Args:
                X: (pd.DataFrame), dataframe of X features

                as_frame: (bool), whether to return data as pandas dataframe (else numpy array)

            Returns:
                series or array of predicted values

        get_params: method to output key model parameters
            Args:
                deep: (bool), determines the extent of information returned, default True

            Returns:
                information on model parameters
    """
    def __init__(self, model, n_estimators, **kwargs):
        super().__init__()
        # Only string names are resolved; anything else is treated as a ready-made base estimator.
        # Construction errors (bad kwargs, missing optional package) now propagate instead of being
        # hidden behind a generic message and leaving the ensemble with a string as its base model.
        if isinstance(model, str):
            if model == 'XGBoostRegressor':
                model = xgboost.XGBRegressor(**kwargs)
            elif model == 'GaussianProcessRegressor':
                # The kernel is optional: None lets GaussianProcessRegressor fall back to its default.
                kernel = kwargs.pop('kernel', None)
                if isinstance(kernel, str):
                    kernel = _make_gpr_kernel(kernel_string=kernel)
                model = GaussianProcessRegressor(kernel=kernel, **kwargs)
            else:
                known_estimators = dict(sklearn.utils.all_estimators())
                if model not in known_estimators:
                    raise ValueError('Could not find designated model type in scikit-learn model library. Note the other '
                                     'supported model type is the keras.wrappers.scikit_learn.KerasRegressor model')
                model = known_estimators[model](**kwargs)
        self.n_estimators = n_estimators
        # scikit-learn renamed BaggingRegressor's `base_estimator` argument to `estimator`;
        # pick whichever keyword the installed version accepts.
        bagging_signature = inspect.signature(BaggingRegressor.__init__).parameters
        estimator_keyword = 'estimator' if 'estimator' in bagging_signature else 'base_estimator'
        self.model = BaggingRegressor(n_estimators=self.n_estimators, **{estimator_keyword: model})
        # Stored as the class name (a string) of the base model, not the estimator object itself.
        self.base_estimator_ = model.__class__.__name__

    def fit(self, X, y):
        # Returns whatever the underlying BaggingRegressor's fit returns, not this wrapper.
        return self.model.fit(X, y)

    def predict(self, X, as_frame=True):
        predictions = self.model.predict(X)
        if as_frame:
            # squeeze() collapses the single 'y_pred' column to a Series.
            # NOTE(review): a one-row prediction will presumably squeeze further to a scalar - confirm callers cope.
            return pd.DataFrame(predictions, columns=['y_pred']).squeeze()
        return predictions.ravel()

    def get_params(self, deep=True):
        # Reports the BaggingRegressor's parameters rather than this wrapper's constructor arguments.
        return self.model.get_params(deep)


def _make_gpr_kernel(kernel_string):
    """
    Method to transform a supplied string to a kernel object for use in GPR models

    The string is either a single kernel class name from sklearn.gaussian_process.kernels (e.g. 'RBF', 'Matern',
    'WhiteKernel', 'ConstantKernel', 'RationalQuadratic', 'ExpSineSquared', 'DotProduct') or several names joined by
    '+' and/or '*', e.g. 'ConstantKernel*RBF+WhiteKernel'. Each kernel is constructed with its default arguments.
    Whitespace around kernel names is ignored.

    Operators are applied strictly left to right, with no operator precedence: 'A+B*C' builds (A+B)*C.

    A '-' is recognised as a separator but is not a supported operation: a warning is printed and the kernel that
    follows it is left out of the result.

    Args:
        kernel_string: (str), a string containing the desired name of the kernel

    Return:
        kernel: sklearn.gaussian_process.kernels object

    Raises:
        AttributeError: if a kernel name is not found in sklearn.gaussian_process.kernels

    """
    # Imported locally so the lookup does not rely on the submodule having been loaded elsewhere.
    from sklearn.gaussian_process import kernels as gpr_kernels

    operator_pattern = r'[+*-]'
    operators_used = re.findall(operator_pattern, kernel_string)
    # With no operator present, re.split returns the whole string as the single kernel name.
    kernel_names = [name.strip() for name in re.split(operator_pattern, kernel_string)]
    kernel_objects = [getattr(gpr_kernels, name)() for name in kernel_names]

    # Start from the first kernel so the result is always defined, even if the first operator is invalid.
    kernel = kernel_objects[0]
    for operator, next_kernel in zip(operators_used, kernel_objects[1:]):
        if operator == "+":
            kernel = kernel + next_kernel
        elif operator == "*":
            kernel = kernel * next_kernel
        else:
            print('Warning: You have chosen an invalid operator to construct a composite kernel. Please choose'
                          ' either "+" or "*".')

    return kernel