Source code for mastml.learning_curve

"""
This module contains methods to construct learning curves, which evaluate a cross-validation performance metric
(e.g. RMSE) as a function of the amount of training data (i.e. a data learning curve) or as a function of the number
of features used in the fit (i.e. a feature learning curve).

LearningCurve:
    Class used to construct data learning curves and feature learning curves

"""

import numpy as np
import pandas as pd
import os
import sys
from datetime import datetime

from sklearn.model_selection import learning_curve
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold

from mastml.metrics import Metrics
from mastml.feature_selectors import SklearnFeatureSelector
from mastml.plots import Line

class LearningCurve():
    """
    This class is used to construct learning curves, both in the form of model performance vs. amount of training
    data and model performance vs. number of features used in the fit.

    Args:
        None

    Methods:
        evaluate: Sets up a save directory and performs both the data and feature-based learning curves

            Args:
                model: (SklearnModel or EnsembleModel), a model made in MAST-ML

                X: (pd.DataFrame), dataframe containing the X feature matrix

                y: (pd.Series), series containing the target y data

                savepath: (str), string denoting the savepath to save the learning curve output

                groups: (pd.Series), series of group designations

                train_sizes: (list or np.array), list or array of floats denoting fractions of training data to
                evaluate for the data learning curve

                cv: (scikit-learn cross-validation object), a scikit-learn cross-validation object

                scoring: (str), string denoting the name of the regression metric used to evaluate the learning
                curves. See mastml.metrics.Metrics._metric_zoo for the full list

                selector: (mastml.feature_selectors instance), a mastml.feature_selectors instance

                make_plot: (bool), whether or not to make the learning curve plots

        data_learning_curve: Method that calculates the model CV score as a function of the amount of training data used

            Args:
                model: (SklearnModel or EnsembleModel), a model made in MAST-ML

                X: (pd.DataFrame), dataframe containing the X feature matrix

                y: (pd.Series), series containing the target y data

                savepath: (str), string denoting the savepath to save the learning curve output

                groups: (pd.Series), series of group designations

                train_sizes: (list or np.array), list or array of floats denoting fractions of training data to
                evaluate for the data learning curve

                cv: (scikit-learn cross-validation object), a scikit-learn cross-validation object

                scoring: (str), string denoting the name of the regression metric used to evaluate the learning
                curves. See mastml.metrics.Metrics._metric_zoo for the full list

                make_plot: (bool), whether or not to make the learning curve plots

            Returns:
                None

        feature_learning_curve: Method that calculates the model CV score as a function of the number of features used

            Args:
                model: (SklearnModel or EnsembleModel), a model made in MAST-ML

                X: (pd.DataFrame), dataframe containing the X feature matrix

                y: (pd.Series), series containing the target y data

                savepath: (str), string denoting the savepath to save the learning curve output

                groups: (pd.Series), series of group designations

                cv: (scikit-learn cross-validation object), a scikit-learn cross-validation object

                scoring: (str), string denoting the name of the regression metric used to evaluate the learning
                curves. See mastml.metrics.Metrics._metric_zoo for the full list

                selector: (mastml.feature_selectors instance), a mastml.feature_selectors instance

                make_plot: (bool), whether or not to make the learning curve plots

            Returns:
                None

        _setup_savedir: Method to create the output save directory for learning curve data

            Args:
                savepath: (str), string denoting the base path to save the output to

            Returns:
                splitdir: (str), path where the learning curve data will be saved

    """

    def __init__(self):
        pass

    def evaluate(self, model, X, y, savepath=None, groups=None, train_sizes=None, cv=None, scoring=None,
                 selector=None, make_plot=True, make_new_dir=True):
        if savepath is None:
            savepath = os.getcwd()
        if make_new_dir is True:
            splitdir = self._setup_savedir(savepath=savepath)
            self.splitdir = splitdir
            savepath = splitdir

        self.data_learning_curve(model=model,
                                 X=X,
                                 y=y,
                                 savepath=savepath,
                                 groups=groups,
                                 train_sizes=train_sizes,
                                 cv=cv,
                                 scoring=scoring,
                                 make_plot=make_plot)

        self.feature_learning_curve(model=model,
                                    X=X,
                                    y=y,
                                    savepath=savepath,
                                    groups=groups,
                                    cv=cv,
                                    scoring=scoring,
                                    selector=selector,
                                    make_plot=make_plot)
        return

    def data_learning_curve(self, model, X, y, savepath=None, groups=None, train_sizes=None, cv=None, scoring=None,
                            make_plot=True):
        if savepath is None:
            savepath = os.getcwd()
        if train_sizes is None:
            train_sizes = np.linspace(0.1, 1.0, 5)
        if cv is None:
            cv = 5
        if model.__class__.__name__ == 'SklearnModel':
            model = model.model

        metrics = Metrics(metrics_list=None)._metric_zoo()
        if scoring is None:
            score_name = 'mean_absolute_error'
            scoring = make_scorer(metrics['mean_absolute_error'][1], greater_is_better=True)  # Note using True b/c if False then sklearn multiplies by -1
        else:
            score_name = scoring
            scoring = make_scorer(metrics[scoring][1], greater_is_better=True)  # Note using True b/c if False then sklearn multiplies by -1

        train_sizes, train_scores, valid_scores = learning_curve(estimator=model, X=X, y=y, train_sizes=train_sizes,
                                                                 scoring=scoring, cv=cv, groups=groups)
        train_mean = np.mean(train_scores, axis=1)
        test_mean = np.mean(valid_scores, axis=1)
        train_stdev = np.std(train_scores, axis=1)
        test_stdev = np.std(valid_scores, axis=1)

        datadict = {"train_sizes": train_sizes,
                    "train_mean": train_mean,
                    "train_std": train_stdev,
                    "test_mean": test_mean,
                    "test_std": test_stdev}
        pd.DataFrame().from_dict(data=datadict).to_excel(os.path.join(savepath, 'data_learning_curve.xlsx'),
                                                         index=False)

        if make_plot is True:
            Line().plot_learning_curve(train_sizes=train_sizes,
                                       train_mean=train_mean,
                                       test_mean=test_mean,
                                       train_stdev=train_stdev,
                                       test_stdev=test_stdev,
                                       learning_curve_type='data_learning_curve',
                                       score_name=score_name,
                                       savepath=savepath)
        return
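
    # Note on the greater_is_better=True pattern used in data_learning_curve and feature_learning_curve:
    # scorers built with make_scorer(..., greater_is_better=False) return the metric multiplied by -1
    # (this is what the inline comments above refer to), so error metrics here are wrapped with
    # greater_is_better=True to keep the reported learning-curve values as raw, positive errors.
    # A minimal sketch of the equivalent scorer (hypothetical variable name, not part of this module):
    #
    #     from sklearn.metrics import make_scorer, mean_absolute_error
    #     mae_scorer = make_scorer(mean_absolute_error, greater_is_better=True)
    #     # passing mae_scorer to sklearn.model_selection.learning_curve yields positive MAE values
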
    def feature_learning_curve(self, model, X, y, savepath=None, groups=None, cv=None, scoring=None, selector=None,
                               make_plot=True):
        if savepath is None:
            savepath = os.getcwd()
        if cv is None:
            cv = KFold(n_splits=5, shuffle=True)
        if model.__class__.__name__ == 'SklearnModel':
            model = model.model

        splits = cv.split(X, y, groups)
        train_inds = list()
        test_inds = list()
        for train, test in splits:
            train_inds.append(train)
            test_inds.append(test)

        metrics = Metrics(metrics_list=None)._metric_zoo()
        if scoring is None:
            score_name = 'mean_absolute_error'
            scoring = make_scorer(metrics['mean_absolute_error'][1], greater_is_better=True)  # Note using True b/c if False then sklearn multiplies by -1
        else:
            score_name = scoring
            scoring = make_scorer(metrics[scoring][1], greater_is_better=True)  # Note using True b/c if False then sklearn multiplies by -1

        if selector is None:
            selector_name = 'SequentialFeatureSelector'
            selector = SklearnFeatureSelector(selector='SequentialFeatureSelector', estimator=model,
                                              n_features_to_select=X.shape[1]-1, scoring=scoring, cv=cv)
        else:
            try:
                selector_name = selector.selector.__class__.__name__
            except AttributeError:
                selector_name = selector.__class__.__name__

        train_mean = list()
        train_stdev = list()
        test_mean = list()
        test_stdev = list()

        if selector_name == 'RFE':
            print("Using RFE as feature selector does not support a custom CV or grouping scheme. Your learning "
                  "curve will be generated properly, but will not use the custom CV or grouping scheme")
            try:
                Xnew = selector.fit(X=X, y=y).transform(X=X)
            except RuntimeError:
                print("You have specified an estimator for RFE that does not have a coef_ or feature_importances_ "
                      "attribute. Acceptable models to use with RFE include: LinearRegression, Lasso, SVR, "
                      "DecisionTreeRegressor, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, etc.")
                sys.exit()
        elif selector_name == 'SelectKBest':
            print("Using SelectKBest as feature selector does not support a custom estimator model, CV or grouping "
                  "scheme. Your learning curve will be generated properly, but will not use the custom model, CV or "
                  "grouping scheme")
            Xnew = selector.fit(X=X, y=y).transform(X=X)
        else:
            Xnew = selector.fit(X=X, y=y).transform(X=X)

        # save selected features for each iteration to text file
        with open(os.path.join(savepath, 'selected_features.txt'), 'w') as f:
            features_selected = Xnew.columns.tolist()
            for feature in features_selected:
                f.write(str(feature)+'\n')

        train_scores = dict()
        test_scores = dict()
        train_sizes = list()
        y = np.array(y)
        for n_features in range(len(features_selected)):
            train_sizes.append(n_features+1)
            Xnew_subset = Xnew.iloc[:, 0:n_features+1]
            cv_number = 1
            Xnew_subset = np.array(Xnew_subset)
            if n_features+1 == 1:
                Xnew_subset = Xnew_subset.reshape(-1, 1)
            for trains, tests in zip(train_inds, test_inds):
                model = model.fit(Xnew_subset[trains], y[trains])
                train_vals = model.predict(Xnew_subset[trains])
                test_vals = model.predict(Xnew_subset[tests])
                # metric functions expect (y_true, y_pred) ordering
                train_scores[cv_number] = scoring._score_func(y[trains], train_vals)
                test_scores[cv_number] = scoring._score_func(y[tests], test_vals)
                cv_number += 1
            train_mean.append(np.mean(list(train_scores.values())))
            train_stdev.append(np.std(list(train_scores.values())))
            test_mean.append(np.mean(list(test_scores.values())))
            test_stdev.append(np.std(list(test_scores.values())))

        train_sizes = np.array(train_sizes)
        train_mean = np.array(train_mean)
        train_stdev = np.array(train_stdev)
        test_mean = np.array(test_mean)
        test_stdev = np.array(test_stdev)

        datadict = {"train_sizes": train_sizes,
                    "features selected": features_selected,
                    "train_mean": train_mean,
                    "train_std": train_stdev,
                    "test_mean": test_mean,
                    "test_std": test_stdev}
        pd.DataFrame().from_dict(data=datadict).to_excel(os.path.join(savepath, 'feature_learning_curve.xlsx'),
                                                         index=False)

        if make_plot is True:
            Line().plot_learning_curve(train_sizes=train_sizes,
                                       train_mean=train_mean,
                                       test_mean=test_mean,
                                       train_stdev=train_stdev,
                                       test_stdev=test_stdev,
                                       learning_curve_type='feature_learning_curve',
                                       score_name=score_name,
                                       savepath=savepath)
        return

    def _setup_savedir(self, savepath):
        now = datetime.now()
        dirname = 'LearningCurve'
        dirname = f"{dirname}_{now.month:02d}_{now.day:02d}" \
                  f"_{now.hour:02d}_{now.minute:02d}_{now.second:02d}"
        if savepath is None:
            splitdir = os.getcwd()
        else:
            splitdir = os.path.join(savepath, dirname)
        if not os.path.exists(splitdir):
            os.mkdir(splitdir)
        self.splitdir = splitdir
        return splitdir
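
# A minimal usage sketch (assumptions: a plain scikit-learn regressor is passed in place of a MAST-ML
# SklearnModel wrapper, which the methods above unwrap but otherwise use as-is, and 'mean_absolute_error'
# is a key in mastml.metrics.Metrics._metric_zoo, as the default scoring path above relies on).
if __name__ == '__main__':
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor

    # Build a small synthetic regression problem as a DataFrame/Series pair
    X_demo, y_demo = make_regression(n_samples=200, n_features=8, noise=0.1, random_state=0)
    X_demo = pd.DataFrame(X_demo, columns=['x%i' % i for i in range(X_demo.shape[1])])
    y_demo = pd.Series(y_demo, name='target')

    # Run both the data and feature learning curves; outputs are written to a timestamped
    # LearningCurve_* directory under the current working directory
    LearningCurve().evaluate(model=RandomForestRegressor(n_estimators=50, random_state=0),
                             X=X_demo,
                             y=y_demo,
                             train_sizes=np.linspace(0.1, 1.0, 5),
                             cv=KFold(n_splits=5, shuffle=True, random_state=0),
                             scoring='mean_absolute_error',
                             make_plot=False)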