"""
This module contains methods for optimizing hyperparameters of models
HyperOptUtils:
This class contains various helper utilities for setting up and running hyperparameter optimization
GridSearch:
This class performs a basic grid search over the parameters and value ranges of interest to find the best
set of model hyperparameters in the provided grid of values
RandomizedSearch:
This class performs a randomized search over the parameters and value ranges of interest to find the best
set of model hyperparameters in the provided grid of values. Often faster than GridSearch. Instead of a grid
of values, it takes a probability distribution name as input (e.g. "norm")
BayesianSearch:
This class performs a Bayesian search over the parameters and value ranges of interest to find the best
set of model hyperparameters in the provided grid of values. Often faster than GridSearch.
"""
import sklearn.model_selection as ms
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import scipy.stats
import pandas as pd
import numpy as np
import os
from ast import literal_eval
from mastml.models import SklearnModel
from mastml.metrics import Metrics
class HyperOptUtils():
    """
    Helper class providing useful methods for other hyperparameter optimization classes.

    Args:
        param_names: (str), semicolon-delimited string of hyperparameter names to optimize

        param_values: (str), semicolon-delimited string of hyperparameter value specifications

    Methods:
        _search_space_generator : parses GridSearch param_dict and checks values
            Args:
                params: (dict), dict of {param_name : param_value_list} pairs.

            Returns:
                params_: (dict), dict of {param_name : search_space} pairs.

        _save_output : saves hyperparameter optimization output and best values to Excel file
            Args:
                savepath: (str), path of output directory

                data: (dict), dict of {estimator_name : hyper_opt.GridSearch.fit()} object, or equivalent

            Returns:
                None

        _get_grid_param_dict : configures the param_dict for GridSearch
            Args:
                None

            Returns:
                param_dict: (dict), dict of {param_name : param_value} pairs.

        _get_randomized_param_dict : configures the param_dict for RandomSearch
            Args:
                None

            Returns:
                param_dict: (dict), dict of {param_name : param_value} pairs.

        _get_bayesian_param_dict : configures the param_dict for BayesianSearch
            Args:
                None

            Returns:
                param_dict: (dict), dict of {param_name : param_value} pairs.
    """
    def __init__(self, param_names, param_values):
        self.param_names = param_names
        self.param_values = param_values

    def _search_space_generator(self, params):
        """
        Build concrete search spaces from parsed value-string lists.

        Each value list is expected to end with a datatype token ('int', 'float',
        'str' or 'tup'). Numeric specs are [start, stop, num, 'lin'|'log', dtype]
        and are expanded with np.linspace/np.logspace; 'str' specs are kept as raw
        string lists; 'tup' specs are string-encoded tuples parsed via literal_eval.
        """
        params_ = dict()
        for param_name, param_vals in params.items():
            # Initialize flags so they are always bound; the original code relied
            # on a NameError caught by a bare except when these were unset.
            dtype = None
            is_tuple = False
            if 'int' in param_vals:
                dtype = 'int'
            elif 'float' in param_vals:
                dtype = 'float'
            elif 'str' in param_vals:
                # String-valued params are used as-is; no numeric grid is built
                param_vals.remove('str')
            elif 'tup' in param_vals:
                is_tuple = True
                param_vals.remove('tup')
            else:
                print('Error: You must specify datatype as int, float or str (last entry in param values for a given parameter)')
            try:
                if is_tuple is True:
                    # Values are string-encoded tuples, e.g. "(1, 2)"; parse them
                    # safely (literal_eval, never eval) regardless of list length
                    param_vals = [literal_eval(param_val) for param_val in param_vals]
                    params_[param_name] = np.array(param_vals)
                elif dtype is None:
                    # 'str'-typed (or unrecognized) values: keep the raw string list
                    params_[param_name] = param_vals
                elif param_vals[3] == "lin":
                    params_[param_name] = np.linspace(float(param_vals[0]), float(param_vals[1]), num=int(param_vals[2]), dtype=dtype)
                elif param_vals[3] == "log":
                    params_[param_name] = np.logspace(float(param_vals[0]), float(param_vals[1]), num=int(param_vals[2]), dtype=dtype)
                else:
                    print('You must specify either lin or log scaling for GridSearch, or be specifying a set of tuples')
                    params_[param_name] = param_vals
            except (IndexError, ValueError, TypeError, SyntaxError):
                # Malformed or short value specs fall back to the raw value list
                params_[param_name] = param_vals
        return params_

    def _save_output(self, savepath, data):
        """
        Save full cross-validation results and the best parameter set of each
        fitted search object to Excel files in savepath.
        """
        for key in data:
            d = data[key]
            # Bayesian search does not report train scores and will error out,
            # so only ('mean_test_score', 'std_test_score') are collected here
            c = dict((k, d.cv_results_[k]) for k in ('mean_test_score', 'std_test_score'))
            out = pd.DataFrame(c, d.cv_results_['params'])
            try:
                best = pd.DataFrame(d.best_params_, index=['Best Parameters'])
            except ValueError:
                # best_params_ contained non-scalar values; let pandas infer the index
                best = pd.DataFrame(d.best_params_)
            out.to_excel(os.path.join(savepath, self.__class__.__name__ + "_" + str(key) + '_output.xlsx'))
            best.to_excel(os.path.join(savepath, self.__class__.__name__ + "_" + str(key) + '_bestparams.xlsx'))
        return

    def _get_grid_param_dict(self):
        """
        Parse the semicolon-delimited name/value strings into a dict of
        {param_name : [value tokens]} pairs.
        """
        param_dict = dict()
        try:
            for name, value_string in zip(self.param_names.split(';'), self.param_values.split(';')):
                param_dict[name] = value_string
        except AttributeError:
            print('Error: An error occurred when trying to parse the hyperparam input values.'
                  ' Please check your input file for errors. Remember values need to be delimited by semicolons')
            exit()
        # Strip surrounding whitespace from names and value strings, then split
        # the value strings on spaces into token lists
        param_dict = {name.strip(): value_string.strip().split(' ') for name, value_string in param_dict.items()}
        return param_dict

    def _get_randomized_param_dict(self):
        """
        Build the RandomizedSearch param dict, turning distribution-name strings
        into scipy.stats distribution objects where possible.
        """
        param_dict = self._get_grid_param_dict()
        param_dict_ = dict()
        for param_name, param_val in param_dict.items():
            # Try making param_val a scipy.stats object. If it isn't, assume it's
            # to be used as a plain list (e.g. trying different string args).
            try:
                param_dict_[param_name] = getattr(scipy.stats, param_val[0])
            except AttributeError:
                param_dict_[param_name] = param_val
        return param_dict_

    def _get_bayesian_param_dict(self):
        """
        Build the BayesianSearch param dict of skopt.space objects. Each value
        spec ends with a datatype token and, for numeric specs, a 'lin'/'log'
        prior as second-to-last entry. For 'log', the bounds are interpreted as
        base-10 exponents.
        """
        param_dict = self._get_grid_param_dict()
        param_dict_ = dict()
        for param_name, param_val_split in param_dict.items():
            dtype = param_val_split[-1]
            if dtype not in ('int', 'float', 'str'):
                print('An error occurred with parsing your param_dict for hyperparam optimization. You must choose one of'
                      '[int, float, str] as second to last entry for Bayesian Search')
            prior = str(param_val_split[-2])
            if prior == 'log':
                prior = 'log-uniform'
            elif prior == 'lin':
                prior = 'uniform'
            if dtype == 'int':
                # NOTE(review): the prior is not passed to Integer, so log spacing
                # is effectively ignored for integer params — confirm this is intended
                param_val_ = Integer(int(param_val_split[0]), int(param_val_split[1]))
            elif dtype == 'float':
                if prior == 'log-uniform':
                    # Bounds were given as exponents for log-spaced search
                    start = float(10 ** (float(param_val_split[0])))
                    end = float(10 ** (float(param_val_split[1])))
                else:
                    start = float(param_val_split[0])
                    end = float(param_val_split[1])
                param_val_ = Real(start, end, prior=prior)
            elif dtype == 'str':
                # Categorical space built from all non-keyword tokens
                param_val_ = Categorical([s for s in param_val_split if s not in ['int', 'float', 'str', 'lin', 'log']])
            else:
                print('Your hyperparam input values were not parsed correctly, possibly due to unreasonable value choices'
                      '(e.g. negative values when only positive values make sense). Please check your input file and '
                      're-run MAST-ML.')
                exit()
            param_dict_[param_name] = param_val_
        return param_dict_
class GridSearch(HyperOptUtils):
    """
    Class to conduct a grid search to find optimized model hyperparameter values

    Args:
        param_names: (list), list containing names of hyperparams to optimize

        param_values: (list), list containing values of hyperparams to optimize

        scoring: (str), string denoting name of regression metric to evaluate learning curves. See mastml.metrics.Metrics._metric_zoo for full list

        n_jobs: (int), number of jobs to run in parallel. Can speed up calculation when using multiple cores

    Methods:
        fit : optimizes hyperparameters
            Args:
                X: (pd.DataFrame), dataframe of X feature data

                y: (pd.Series), series of target y data

                model: (mastml.models object), a MAST-ML model, e.g. SklearnModel or EnsembleModel

                cv: (scikit-learn cross-validation object), a scikit-learn cross-validation object

                savepath: (str), path of output directory

            Returns:
                best_estimator (mastml.models object) : the optimized MAST-ML model
    """
    def __init__(self, param_names, param_values, scoring=None, n_jobs=1):
        super(GridSearch, self).__init__(param_names=param_names, param_values=param_values)
        self.param_names = param_names
        self.param_values = param_values
        self.scoring = scoring
        self.n_jobs = int(n_jobs)

    def fit(self, X, y, model, cv=None, savepath=None):
        """Run GridSearchCV over the configured param grid and return the refit best model."""
        rst = dict()
        param_dict = self._get_grid_param_dict()
        if savepath is None:
            savepath = os.getcwd()
        estimator_name = model.model.__class__.__name__
        param_dict = self._search_space_generator(param_dict)
        if cv is None:
            cv = ms.RepeatedKFold()
        metrics = Metrics(metrics_list=None)._metric_zoo()
        # Default to mean absolute error when no scoring metric was specified.
        # greater_is_better comes from the metric zoo; sklearn multiplies scores
        # by -1 when it is False.
        scoring_name = 'mean_absolute_error' if self.scoring is None else self.scoring
        scoring = make_scorer(metrics[scoring_name][1], greater_is_better=metrics[scoring_name][0])
        model = GridSearchCV(model.model,
                             param_dict,
                             scoring=scoring,
                             cv=cv,
                             refit=True,
                             n_jobs=self.n_jobs,
                             verbose=0)
        try:
            rst[estimator_name] = model.fit(X, y)
        except Exception as e:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit still
            # propagate, and the underlying error is reported instead of hidden
            print('Hyperparameter optimization failed, likely due to inappropriate domain of values to optimize'
                  ' one or more parameters over. Please check your input file and the sklearn docs for the mode'
                  ' you are optimizing for the domain of correct values')
            print('Underlying error: ' + repr(e))
            exit()
        best_estimator = rst[estimator_name].best_estimator_
        self._save_output(savepath, rst)
        # Need to rebuild the estimator as SklearnModel
        best_estimator = SklearnModel(model=best_estimator.__class__.__name__, **best_estimator.get_params())
        return best_estimator
class RandomizedSearch(HyperOptUtils):
    """
    Class to conduct a randomized search to find optimized model hyperparameter values

    Args:
        param_names: (list), list containing names of hyperparams to optimize

        param_values: (list), list containing values of hyperparams to optimize

        scoring: (str), string denoting name of regression metric to evaluate learning curves. See mastml.metrics.Metrics._metric_zoo for full list

        n_iter: (int), number denoting the number of evaluations in the search space to perform. Higher numbers will take longer but will be more accurate

        n_jobs: (int), number of jobs to run in parallel. Can speed up calculation when using multiple cores

    Methods:
        fit : optimizes hyperparameters
            Args:
                X: (pd.DataFrame), dataframe of X feature data

                y: (pd.Series), series of target y data

                model: (mastml.models object), a MAST-ML model, e.g. SklearnModel or EnsembleModel

                cv: (scikit-learn cross-validation object), a scikit-learn cross-validation object

                savepath: (str), path of output directory

            Returns:
                best_estimator (mastml.models object) : the optimized MAST-ML model
    """
    def __init__(self, param_names, param_values, scoring=None, n_iter=50, n_jobs=1):
        super(RandomizedSearch, self).__init__(param_names=param_names, param_values=param_values)
        self.param_names = param_names
        self.param_values = param_values
        self.scoring = scoring
        self.n_iter = int(n_iter)
        self.n_jobs = int(n_jobs)

    def fit(self, X, y, model, cv=None, savepath=None, refit=True):
        """Run RandomizedSearchCV over the configured distributions and return the best model."""
        rst = dict()
        param_dict = self._get_randomized_param_dict()
        if savepath is None:
            savepath = os.getcwd()
        estimator_name = model.model.__class__.__name__
        if cv is None:
            cv = ms.RepeatedKFold()
        metrics = Metrics(metrics_list=None)._metric_zoo()
        # Default to mean absolute error when no scoring metric was specified.
        # greater_is_better comes from the metric zoo; sklearn multiplies scores
        # by -1 when it is False.
        scoring_name = 'mean_absolute_error' if self.scoring is None else self.scoring
        scoring = make_scorer(metrics[scoring_name][1], greater_is_better=metrics[scoring_name][0])
        model = RandomizedSearchCV(model.model,
                                   param_dict,
                                   n_iter=self.n_iter,
                                   scoring=scoring,
                                   cv=cv,
                                   refit=refit,
                                   n_jobs=self.n_jobs,
                                   verbose=0)
        try:
            rst[estimator_name] = model.fit(X, y)
        except Exception as e:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit still
            # propagate, and the underlying error is reported instead of hidden
            print('Hyperparameter optimization failed, likely due to inappropriate domain of values to optimize'
                  ' one or more parameters over. Please check your input file and the sklearn docs for the mode'
                  ' you are optimizing for the domain of correct values')
            print('Underlying error: ' + repr(e))
            exit()
        best_estimator = rst[estimator_name].best_estimator_
        # Need to rebuild the best estimator back into SklearnModel object
        best_estimator = SklearnModel(model=best_estimator.__class__.__name__, **best_estimator.get_params())
        self._save_output(savepath, rst)
        return best_estimator
# NOTE: there is a known problem where BayesSearchCV in skopt doesn't work with sklearn 0.24 (deprecated iid parameter).
# They are working on fixing this (as of 2/4/21). See updates at https://github.com/scikit-optimize/scikit-optimize/issues/978
class BayesianSearch(HyperOptUtils):
    """
    Class to conduct a Bayesian search to find optimized model hyperparameter values

    Args:
        param_names: (list), list containing names of hyperparams to optimize

        param_values: (list), list containing values of hyperparams to optimize

        scoring: (str), string denoting name of regression metric to evaluate learning curves. See mastml.metrics.Metrics._metric_zoo for full list

        n_iter: (int), number denoting the number of evaluations in the search space to perform. Higher numbers will take longer but will be more accurate

        n_jobs: (int), number of jobs to run in parallel. Can speed up calculation when using multiple cores

    Methods:
        fit : optimizes hyperparameters
            Args:
                X: (pd.DataFrame), dataframe of X feature data

                y: (pd.Series), series of target y data

                model: (mastml.models object), a MAST-ML model, e.g. SklearnModel or EnsembleModel

                cv: (scikit-learn cross-validation object), a scikit-learn cross-validation object

                savepath: (str), path of output directory

            Returns:
                best_estimator (mastml.models object) : the optimized MAST-ML model
    """
    def __init__(self, param_names, param_values, scoring=None, n_iter=50, n_jobs=1):
        print('Warning: As of 2/4/21, Bayesian search from skopt is not compatible with'
              ' sklearn>=0.24. Downgrade to sklearn 0.23.2 should fix the issue but may cause'
              ' other unforseen compatibility issues in the MAST-ML code')
        super(BayesianSearch, self).__init__(param_names=param_names, param_values=param_values)
        self.param_names = param_names
        self.param_values = param_values
        self.scoring = scoring
        self.n_iter = int(n_iter)
        self.n_jobs = int(n_jobs)

    def fit(self, X, y, model, cv=None, savepath=None):
        """Run BayesSearchCV over the configured skopt search spaces and return the best model."""
        rst = dict()
        param_dict = self._get_bayesian_param_dict()
        if savepath is None:
            savepath = os.getcwd()
        # Use the wrapped estimator's class name, consistent with GridSearch and
        # RandomizedSearch (previously used the wrapper's class name)
        estimator_name = model.model.__class__.__name__
        if cv is None:
            cv = ms.RepeatedKFold()
        metrics = Metrics(metrics_list=None)._metric_zoo()
        # Default to mean absolute error when no scoring metric was specified.
        # greater_is_better comes from the metric zoo; sklearn multiplies scores
        # by -1 when it is False.
        scoring_name = 'mean_absolute_error' if self.scoring is None else self.scoring
        scoring = make_scorer(metrics[scoring_name][1], greater_is_better=metrics[scoring_name][0])
        model = BayesSearchCV(estimator=model.model,
                              search_spaces=param_dict,
                              n_iter=self.n_iter,
                              scoring=scoring,
                              cv=cv,
                              refit=True,
                              n_jobs=self.n_jobs,
                              verbose=1)
        try:
            rst[estimator_name] = model.fit(X, y)
        except Exception as e:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit still
            # propagate, and the underlying error is reported instead of hidden
            print('Hyperparameter optimization failed, likely due to inappropriate domain of values to optimize'
                  ' one or more parameters over. Please check your input file and the sklearn docs for the mode'
                  ' you are optimizing for the domain of correct values')
            print('Underlying error: ' + repr(e))
            exit()
        best_estimator = rst[estimator_name].best_estimator_
        # Need to rebuild the estimator as SklearnModel
        best_estimator = SklearnModel(model=best_estimator.__class__.__name__, **best_estimator.get_params())
        self._save_output(savepath, rst)
        return best_estimator

    @property
    def _estimator_name(self):
        # NOTE(review): self.estimator is never assigned in this class, so this
        # property raises AttributeError if accessed — looks like dead code; confirm
        return self.estimator.__class__.__name__