Source code for mastml.learning_curve

"""
This module contains methods to construct learning curves, which evaluate a cross-validation performance metric
(e.g. RMSE) as a function of the amount of training data (i.e. a data learning curve) or as a function of the number
of features used in the fit (i.e. a feature learning curve).

LearningCurve:
    Class used to construct data learning curves and feature learning curves

"""

import numpy as np
import pandas as pd
import os
import sys
from datetime import datetime

from sklearn.model_selection import learning_curve
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold

from mastml.metrics import Metrics
from mastml.feature_selectors import SklearnFeatureSelector
from mastml.plots import Line

class LearningCurve():
    """
    This class is used to construct learning curves, both in the form of model performance vs. amount of training
    data and model performance vs. number of features used in the fit.

    Args:
        None

    Methods:
        evaluate: Sets up a save directory and performs both the data and feature-based learning curves

            Args:
                model: (SklearnModel or EnsembleModel), a model made in MAST-ML

                X: (pd.DataFrame), dataframe containing the X feature matrix

                y: (pd.Series), series containing the target y data

                savepath: (str), string denoting the savepath to save the learning curve output

                groups: (pd.Series), series of group designations

                train_sizes: (list or np.array), list or array of floats denoting fractions of training data to
                evaluate for the data learning curve

                cv: (scikit-learn cross-validation object), a scikit-learn cross-validation object

                scoring: (str), string denoting the name of the regression metric used to evaluate the learning
                curves. See mastml.metrics.Metrics._metric_zoo for the full list

                selector: (mastml.feature_selectors instance), a mastml.feature_selectors instance

                make_plot: (bool), whether or not to make the learning curve plots

        data_learning_curve: Method that calculates the model CV score as a function of the amount of training data used

            Args:
                model: (SklearnModel or EnsembleModel), a model made in MAST-ML

                X: (pd.DataFrame), dataframe containing the X feature matrix

                y: (pd.Series), series containing the target y data

                savepath: (str), string denoting the savepath to save the learning curve output

                groups: (pd.Series), series of group designations

                train_sizes: (list or np.array), list or array of floats denoting fractions of training data to
                evaluate for the data learning curve

                cv: (scikit-learn cross-validation object), a scikit-learn cross-validation object

                scoring: (str), string denoting the name of the regression metric used to evaluate the learning
                curves. See mastml.metrics.Metrics._metric_zoo for the full list

                make_plot: (bool), whether or not to make the learning curve plots

            Returns:
                None

        feature_learning_curve: Method that calculates the model CV score as a function of the number of features used

            Args:
                model: (SklearnModel or EnsembleModel), a model made in MAST-ML

                X: (pd.DataFrame), dataframe containing the X feature matrix

                y: (pd.Series), series containing the target y data

                savepath: (str), string denoting the savepath to save the learning curve output

                groups: (pd.Series), series of group designations

                cv: (scikit-learn cross-validation object), a scikit-learn cross-validation object

                scoring: (str), string denoting the name of the regression metric used to evaluate the learning
                curves. See mastml.metrics.Metrics._metric_zoo for the full list

                selector: (mastml.feature_selectors instance), a mastml.feature_selectors instance

                make_plot: (bool), whether or not to make the learning curve plots

            Returns:
                None

        _setup_savedir: Method to create the output save directory for learning curve data

            Args:
                savepath: (str), string denoting the base path to save the output to

            Returns:
                splitdir: (str), path where the learning curve data will be saved

    """

    def __init__(self):
        pass

    def evaluate(self, model, X, y, savepath=None, groups=None, train_sizes=None, cv=None, scoring=None,
                 selector=None, make_plot=True, make_new_dir=True):
        if savepath is None:
            savepath = os.getcwd()
        if make_new_dir is True:
            splitdir = self._setup_savedir(savepath=savepath)
            self.splitdir = splitdir
            savepath = splitdir

        self.data_learning_curve(model=model,
                                 X=X,
                                 y=y,
                                 savepath=savepath,
                                 groups=groups,
                                 train_sizes=train_sizes,
                                 cv=cv,
                                 scoring=scoring,
                                 make_plot=make_plot)

        self.feature_learning_curve(model=model,
                                    X=X,
                                    y=y,
                                    savepath=savepath,
                                    groups=groups,
                                    cv=cv,
                                    scoring=scoring,
                                    selector=selector,
                                    make_plot=make_plot)
        return

    def data_learning_curve(self, model, X, y, savepath=None, groups=None, train_sizes=None, cv=None, scoring=None,
                            make_plot=True):
        if savepath is None:
            savepath = os.getcwd()
        if train_sizes is None:
            train_sizes = np.linspace(0.1, 1.0, 5)
        if cv is None:
            cv = 5
        if model.__class__.__name__ == 'SklearnModel':
            model = model.model

        metrics = Metrics(metrics_list=None)._metric_zoo()
        if scoring is None:
            score_name = 'mean_absolute_error'
            scoring = make_scorer(metrics['mean_absolute_error'][1], greater_is_better=True)  # Note using True b/c if False then sklearn multiplies by -1
        else:
            score_name = scoring
            scoring = make_scorer(metrics[scoring][1], greater_is_better=True)  # Note using True b/c if False then sklearn multiplies by -1

        train_sizes, train_scores, valid_scores = learning_curve(estimator=model, X=X, y=y, train_sizes=train_sizes,
                                                                 scoring=scoring, cv=cv, groups=groups)
        train_mean = np.mean(train_scores, axis=1)
        test_mean = np.mean(valid_scores, axis=1)
        train_stdev = np.std(train_scores, axis=1)
        test_stdev = np.std(valid_scores, axis=1)

        datadict = {"train_sizes": train_sizes,
                    "train_mean": train_mean,
                    "train_std": train_stdev,
                    "test_mean": test_mean,
                    "test_std": test_stdev}
        pd.DataFrame().from_dict(data=datadict).to_excel(os.path.join(savepath, 'data_learning_curve.xlsx'),
                                                         index=False)

        if make_plot is True:
            Line().plot_learning_curve(train_sizes=train_sizes,
                                       train_mean=train_mean,
                                       test_mean=test_mean,
                                       train_stdev=train_stdev,
                                       test_stdev=test_stdev,
                                       learning_curve_type='data_learning_curve',
                                       score_name=score_name,
                                       savepath=savepath)
        return
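
    # Note on the greater_is_better=True pattern used in data_learning_curve and feature_learning_curve:
    # scorers built with make_scorer(..., greater_is_better=False) return the metric multiplied by -1
    # (this is what the inline comments above refer to), so error metrics here are wrapped with
    # greater_is_better=True to keep the reported learning-curve values as raw, positive errors.
    # A minimal sketch of the equivalent scorer (hypothetical variable name, not part of this module):
    #
    #     from sklearn.metrics import make_scorer, mean_absolute_error
    #     mae_scorer = make_scorer(mean_absolute_error, greater_is_better=True)
    #     # passing mae_scorer to sklearn.model_selection.learning_curve yields positive MAE values
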
    def feature_learning_curve(self, model, X, y, savepath=None, groups=None, cv=None, scoring=None, selector=None,
                               make_plot=True):
        if savepath is None:
            savepath = os.getcwd()
        if cv is None:
            cv = KFold(n_splits=5, shuffle=True)
        if model.__class__.__name__ == 'SklearnModel':
            model = model.model

        splits = cv.split(X, y, groups)
        train_inds = list()
        test_inds = list()
        for train, test in splits:
            train_inds.append(train)
            test_inds.append(test)

        metrics = Metrics(metrics_list=None)._metric_zoo()
        if scoring is None:
            score_name = 'mean_absolute_error'
            scoring = make_scorer(metrics['mean_absolute_error'][1], greater_is_better=True)  # Note using True b/c if False then sklearn multiplies by -1
        else:
            score_name = scoring
            scoring = make_scorer(metrics[scoring][1], greater_is_better=True)  # Note using True b/c if False then sklearn multiplies by -1

        if selector is None:
            selector_name = 'SequentialFeatureSelector'
            selector = SklearnFeatureSelector(selector='SequentialFeatureSelector', estimator=model,
                                              n_features_to_select=X.shape[1]-1, scoring=scoring, cv=cv)
        else:
            try:
                selector_name = selector.selector.__class__.__name__
            except AttributeError:
                selector_name = selector.__class__.__name__

        train_mean = list()
        train_stdev = list()
        test_mean = list()
        test_stdev = list()

        if selector_name == 'RFE':
            print("Using RFE as feature selector does not support a custom CV or grouping scheme. Your learning "
                  "curve will be generated properly, but will not use the custom CV or grouping scheme")
            try:
                Xnew = selector.fit(X=X, y=y).transform(X=X)
            except RuntimeError:
                print("You have specified an estimator for RFE that does not have a coef_ or feature_importances_ "
                      "attribute. Acceptable models to use with RFE include: LinearRegression, Lasso, SVR, "
                      "DecisionTreeRegressor, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, etc.")
                sys.exit()
        elif selector_name == 'SelectKBest':
            print("Using SelectKBest as feature selector does not support a custom estimator model, CV or grouping "
                  "scheme. Your learning curve will be generated properly, but will not use the custom model, CV or "
                  "grouping scheme")
            Xnew = selector.fit(X=X, y=y).transform(X=X)
        else:
            Xnew = selector.fit(X=X, y=y).transform(X=X)

        # save selected features for each iteration to text file
        with open(os.path.join(savepath, 'selected_features.txt'), 'w') as f:
            features_selected = Xnew.columns.tolist()
            for feature in features_selected:
                f.write(str(feature)+'\n')

        train_scores = dict()
        test_scores = dict()
        train_sizes = list()
        y = np.array(y)
        for n_features in range(len(features_selected)):
            train_sizes.append(n_features+1)
            Xnew_subset = Xnew.iloc[:, 0:n_features+1]
            cv_number = 1
            Xnew_subset = np.array(Xnew_subset)
            if n_features+1 == 1:
                Xnew_subset = Xnew_subset.reshape(-1, 1)
            for trains, tests in zip(train_inds, test_inds):
                model = model.fit(Xnew_subset[trains], y[trains])
                train_vals = model.predict(Xnew_subset[trains])
                test_vals = model.predict(Xnew_subset[tests])
                # metric functions expect (y_true, y_pred) ordering
                train_scores[cv_number] = scoring._score_func(y[trains], train_vals)
                test_scores[cv_number] = scoring._score_func(y[tests], test_vals)
                cv_number += 1
            train_mean.append(np.mean(list(train_scores.values())))
            train_stdev.append(np.std(list(train_scores.values())))
            test_mean.append(np.mean(list(test_scores.values())))
            test_stdev.append(np.std(list(test_scores.values())))

        train_sizes = np.array(train_sizes)
        train_mean = np.array(train_mean)
        train_stdev = np.array(train_stdev)
        test_mean = np.array(test_mean)
        test_stdev = np.array(test_stdev)

        datadict = {"train_sizes": train_sizes,
                    "features selected": features_selected,
                    "train_mean": train_mean,
                    "train_std": train_stdev,
                    "test_mean": test_mean,
                    "test_std": test_stdev}
        pd.DataFrame().from_dict(data=datadict).to_excel(os.path.join(savepath, 'feature_learning_curve.xlsx'),
                                                         index=False)

        if make_plot is True:
            Line().plot_learning_curve(train_sizes=train_sizes,
                                       train_mean=train_mean,
                                       test_mean=test_mean,
                                       train_stdev=train_stdev,
                                       test_stdev=test_stdev,
                                       learning_curve_type='feature_learning_curve',
                                       score_name=score_name,
                                       savepath=savepath)
        return

    def _setup_savedir(self, savepath):
        now = datetime.now()
        dirname = 'LearningCurve'
        dirname = f"{dirname}_{now.month:02d}_{now.day:02d}" \
                  f"_{now.hour:02d}_{now.minute:02d}_{now.second:02d}"
        if savepath is None:
            splitdir = os.getcwd()
        else:
            splitdir = os.path.join(savepath, dirname)
        if not os.path.exists(splitdir):
            os.mkdir(splitdir)
        self.splitdir = splitdir
        return splitdir
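
# A minimal usage sketch (assumptions: a plain scikit-learn regressor is passed in place of a MAST-ML
# SklearnModel wrapper, which the methods above unwrap but otherwise use as-is, and 'mean_absolute_error'
# is a key in mastml.metrics.Metrics._metric_zoo, as the default scoring path above relies on).
if __name__ == '__main__':
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor

    # Build a small synthetic regression problem as a DataFrame/Series pair
    X_demo, y_demo = make_regression(n_samples=200, n_features=8, noise=0.1, random_state=0)
    X_demo = pd.DataFrame(X_demo, columns=['x%i' % i for i in range(X_demo.shape[1])])
    y_demo = pd.Series(y_demo, name='target')

    # Run both the data and feature learning curves; outputs are written to a timestamped
    # LearningCurve_* directory under the current working directory
    LearningCurve().evaluate(model=RandomForestRegressor(n_estimators=50, random_state=0),
                             X=X_demo,
                             y=y_demo,
                             train_sizes=np.linspace(0.1, 1.0, 5),
                             cv=KFold(n_splits=5, shuffle=True, random_state=0),
                             scoring='mean_absolute_error',
                             make_plot=False)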