"""
Module for constructing models for use in MAST-ML.
SklearnModel:
Class that wraps scikit-learn models to have MAST-ML type functionality. Providing the model name as a string
and the keyword arguments for the model parameters will construct the model. Note that this class also supports
construction of XGBoost models and Keras neural network models via Keras' keras.wrappers.scikit_learn.KerasRegressor
model.
EnsembleModel:
Class that constructs a model which is an ensemble of many base models (sometimes called weak learners). This
class supports construction of ensembles of most scikit-learn regression models as well as ensembles of neural
networks that are made via Keras' keras.wrappers.scikit_learn.KerasRegressor class.
"""
import pandas as pd
import sklearn.base
import sklearn.utils
from sklearn.ensemble import BaggingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
import inspect
from pprint import pprint
import numpy as np
import re
from sklearn.base import BaseEstimator, TransformerMixin
# Optional dependencies: only an ImportError is tolerated here, so genuine
# errors (e.g. KeyboardInterrupt or a broken installation raising something
# else) are not silently swallowed by a bare except.
try:
    import xgboost
except ImportError:
    print('XGBoost is an optional dependency. If you want to use XGBoost models, please manually install xgboost package with '
          'pip install xgboost. If have error with finding libxgboost.dylib library, do'
          'brew install libomp. If do not have brew on your system, first do'
          ' ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" from the Terminal')
try:
    from sklego.linear_model import LowessRegression
except ImportError:
    print('scikit-lego is an optional dependency, enabling use of the LowessRegression model. If you want to use this model, '
          'do "pip install scikit-lego"')
class SklearnModel(BaseEstimator, TransformerMixin):
    """
    Class to wrap any sklearn estimator, and provide some new dataframe functionality

    Args:
        model: (str), string denoting the name of an sklearn estimator object, e.g. KernelRidge.
            The special names 'XGBoostRegressor', 'GaussianProcessRegressor' and 'LowessRegression'
            are handled explicitly; anything else is looked up in sklearn's estimator registry.
        kwargs: keyword pairs of values to include for model, e.g. for KernelRidge can specify
            kernel, alpha, gamma values

    Methods:
        fit: method that fits the model parameters to the provided training data
            Args:
                X: (pd.DataFrame), dataframe of X features
                y: (pd.Series), series of y target data
            Returns:
                fitted model

        predict: method that evaluates model on new data to give predictions
            Args:
                X: (pd.DataFrame), dataframe of X features
                as_frame: (bool), whether to return data as pandas dataframe (else numpy array)
            Returns:
                series or array of predicted values

        help: method to output key information on class use, e.g. methods and parameters
            Args:
                None
            Returns:
                None, but outputs help to screen
    """

    def __init__(self, model, **kwargs):
        if model == 'XGBoostRegressor':
            self.model = xgboost.XGBRegressor(**kwargs)
        elif model == 'GaussianProcessRegressor':
            # The kernel arrives as a string spec (e.g. 'RBF+WhiteKernel'); pop it
            # so the remaining kwargs can be forwarded to the regressor unchanged.
            kernel = _make_gpr_kernel(kernel_string=kwargs.pop('kernel'))
            self.model = GaussianProcessRegressor(kernel=kernel, **kwargs)
        elif model == 'LowessRegression':
            self.model = LowessRegression(**kwargs)
        else:
            # Look the estimator class up by name in sklearn's registry of all estimators.
            self.model = dict(sklearn.utils.all_estimators())[model](**kwargs)

    def fit(self, X, y):
        """Fit the wrapped estimator to the training data and return the fitted model."""
        return self.model.fit(X, y)

    def predict(self, X, as_frame=True):
        """Predict on X; return a pandas Series (as_frame=True) or a flat numpy array."""
        if as_frame:
            return pd.DataFrame(self.model.predict(X), columns=['y_pred']).squeeze()
        return self.model.predict(X).ravel()

    def predict_proba(self, X):
        """Return class probabilities if the wrapped model supports them.

        NOTE: deliberately best-effort — returns None (implicitly) when the
        wrapped estimator has no predict_proba method.
        """
        if hasattr(self.model, 'predict_proba'):
            return self.model.predict_proba(X)

    def get_params(self, deep=True):
        """Return the wrapped estimator's parameters (see sklearn get_params)."""
        return self.model.get_params(deep=deep)

    def help(self):
        """Print the wrapped model's docstring, methods and attributes to screen."""
        print('Documentation for', self.model)
        pprint(dict(inspect.getmembers(self.model))['__doc__'])
        print('\n')
        print('Class methods for,', self.model)
        pprint(dict(inspect.getmembers(self.model, predicate=inspect.ismethod)))
        print('\n')
        print('Class attributes for,', self.model)
        pprint(self.model.__dict__)
        return
class EnsembleModel(BaseEstimator, TransformerMixin):
    """
    Class used to construct ensemble models with a particular number and type of weak learner
    (base model). The ensemble model is compatible with most scikit-learn regressor models and
    KerasRegressor models

    Args:
        model: (str), string name denoting the name of the model type to use as the base model
        n_estimators: (int), the number of base models to include in the ensemble
        kwargs: keyword arguments for the base model parameter names and values

    Methods:
        fit: method that fits the model parameters to the provided training data
            Args:
                X: (pd.DataFrame), dataframe of X features
                y: (pd.Series), series of y target data
            Returns:
                fitted model

        predict: method that evaluates model on new data to give predictions
            Args:
                X: (pd.DataFrame), dataframe of X features
                as_frame: (bool), whether to return data as pandas dataframe (else numpy array)
            Returns:
                series or array of predicted values

        get_params: method to output key model parameters
            Args:
                deep: (bool), determines the extent of information returned, default True
            Returns:
                information on model parameters
    """

    def __init__(self, model, n_estimators, **kwargs):
        super(EnsembleModel, self).__init__()
        try:
            if model == 'XGBoostRegressor':
                model = xgboost.XGBRegressor(**kwargs)
            elif model == 'GaussianProcessRegressor':
                # Convert the string kernel spec into a kernel object; pop it so
                # the remaining kwargs pass straight through to the regressor.
                kernel = _make_gpr_kernel(kernel_string=kwargs.pop('kernel'))
                model = GaussianProcessRegressor(kernel=kernel, **kwargs)
            else:
                model = dict(sklearn.utils.all_estimators())[model](**kwargs)
        except Exception:
            print('Could not find designated model type in scikit-learn model library. Note the other supported model'
                  'type is the keras.wrappers.scikit_learn.KerasRegressor model')
            # Re-raise: continuing with `model` still a string would build a broken
            # BaggingRegressor that only fails later with a confusing error.
            raise
        self.n_estimators = n_estimators
        self.model = BaggingRegressor(base_estimator=model, n_estimators=self.n_estimators)
        # Record the base learner's class name for reporting/plotting elsewhere.
        self.base_estimator_ = model.__class__.__name__

    def fit(self, X, y):
        """Fit the bagging ensemble to the training data and return the fitted model."""
        return self.model.fit(X, y)

    def predict(self, X, as_frame=True):
        """Predict on X; return a pandas Series (as_frame=True) or a flat numpy array."""
        if as_frame:
            return pd.DataFrame(self.model.predict(X), columns=['y_pred']).squeeze()
        return self.model.predict(X).ravel()

    def get_params(self, deep=True):
        """Return the underlying BaggingRegressor's parameters (see sklearn get_params)."""
        return self.model.get_params(deep=deep)
def _make_gpr_kernel(kernel_string):
    """
    Method to transform a supplied string to a kernel object for use in GPR models

    Supported kernel names include WhiteKernel, RBF, ConstantKernel, Matern,
    RationalQuadratic, ExpSineSquared and DotProduct; composites are expressed
    with '+' and '*' between names, e.g. 'RBF+WhiteKernel'.

    Args:
        kernel_string: (str), a string containing the desired name of the kernel

    Return:
        kernel: sklearn.gaussian_process.kernels object
    """
    kernel_operators = ['+', '*', '-']
    # Collect, in order of appearance, every operator character in the string.
    kernel_operators_used = [s for s in kernel_string if s in kernel_operators]
    if not kernel_operators_used:
        # Single kernel, no operators to combine.
        kernel_names = [kernel_string]
    else:
        # Split the string on any operator actually used,
        # e.g. 'RBF+WhiteKernel' -> ['RBF', 'WhiteKernel'].
        pattern = '[' + re.escape(''.join(sorted(set(kernel_operators_used)))) + ']'
        kernel_names = re.split(pattern, kernel_string)
    # Instantiate each named kernel class (with default hyperparameters).
    kernel_objects = [getattr(sklearn.gaussian_process.kernels, name)() for name in kernel_names]
    # Fold left over the operators: kernel = (((k0 op k1) op k2) ...).
    # Starting from kernel_objects[0] also fixes the original unbound-variable
    # crash when the first operator was invalid.
    kernel = kernel_objects[0]
    for i, operator in enumerate(kernel_operators_used):
        if operator == "+":
            kernel = kernel + kernel_objects[i + 1]
        elif operator == "*":
            kernel = kernel * kernel_objects[i + 1]
        else:
            print('Warning: You have chosen an invalid operator to construct a composite kernel. Please choose'
                  ' either "+" or "*".')
    return kernel