# Source code for mastml.learning_curve

"""
This module contains methods to construct learning curves, which evaluate some cross-validation performance metric (e.g. RMSE)
as a function of amount of training data (i.e. a sample learning curve) or as a function of the number of features used
in the fitting (i.e. a feature learning curve).
"""

import numpy as np
import pandas as pd
import warnings
import logging
import os

from sklearn.model_selection import learning_curve
from sklearn.feature_selection import f_regression

from mastml.legos import feature_selectors as fs

# SciPy emits a benign "internal gelsd" driver warning on macOS; keep it out of the output.
warnings.filterwarnings("ignore", message="^internal gelsd", module="scipy")

# Shared package logger used by the learning-curve routines in this module.
log = logging.getLogger('mastml')

def sample_learning_curve(X, y, estimator, cv, scoring, Xgroups=None):
    """
    Method that calculates data used to plot a sample learning curve, e.g. the RMSE of a cross-validation routine using a
    specified model and a given fraction of the total training data

    Args:
        X: (numpy array), array of X data values

        y: (numpy array), array of y data values

        estimator: (scikit-learn model object), a scikit-learn model used for fitting

        cv: (scikit-learn cross validation object), a scikit-learn cross validation object to construct train/test splits

        scoring: (scikit-learn metric object), a scikit-learn metric to use as a scorer

        Xgroups: (array-like or None), group labels, one per row of the data. The values are flattened to 1D and passed
            to scikit-learn's learning_curve as ``groups``. If None or empty, every row is assigned to a single group (0).

    Returns:
        train_sizes: (numpy array), the training set sizes returned by scikit-learn's learning_curve for the requested
            fractions 0.1, 0.2, ..., 1.0 (scikit-learn reports these as absolute numbers of training samples)

        train_mean: (numpy array), array of means of training data scores for each training data fraction

        test_mean: (numpy array), array of means of testing data scores for each training data fraction

        train_stdev: (numpy array), array of standard deviations of training data scores for each training data fraction

        test_stdev: (numpy array), array of standard deviations of testing data scores for each training data fraction

    """

    # Requested training fractions: 10% to 100% of the available training data in 10% steps.
    train_sizes = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

    # Xgroups defaults to None and may be a plain list, so it cannot be assumed to have a
    # ``.shape`` attribute. Normalise whatever was given into a flat array of labels and fall
    # back to one shared group when nothing usable was provided.
    if Xgroups is None:
        Xgroups = np.zeros(len(y))
    else:
        Xgroups = np.asarray(Xgroups).reshape(-1, )
        if Xgroups.size == 0:
            Xgroups = np.zeros(len(y))

    # Note: train_sizes is rebound to the sizes reported back by scikit-learn.
    train_sizes, train_scores, valid_scores = learning_curve(estimator=estimator, X=X, y=y, train_sizes=train_sizes,
                                                             scoring=scoring, cv=cv, groups=Xgroups)

    # Scores come back as (n_sizes, n_cv_folds); aggregate across the folds for each size.
    train_mean = np.mean(train_scores, axis=1)
    test_mean = np.mean(valid_scores, axis=1)
    train_stdev = np.std(train_scores, axis=1)
    test_stdev = np.std(valid_scores, axis=1)

    return train_sizes, train_mean, test_mean, train_stdev, test_stdev

def feature_learning_curve(X, y, estimator, cv, scoring, selector_name, savepath, n_features_to_select=None, Xgroups=None):
    """
    Method that calculates data used to plot a feature learning curve, e.g. the RMSE of a cross-validation routine using a
    specified model and a given number of features

    Args:
        X: (numpy array), array of X data values

        y: (numpy array), array of y data values

        estimator: (scikit-learn model object), a scikit-learn model used for fitting

        cv: (scikit-learn cross validation object), a scikit-learn cross validation object to construct train/test splits

        scoring: (scikit-learn metric object), a scikit-learn metric to use as a scorer. Only its underlying metric
            function (``_score_func``) is used here.

        selector_name: (str), name of a scikit-learn or MAST-ML feature selection routine. One of SelectKBest, RFE,
            SequentialFeatureSelector, MASTMLFeatureSelector, or None to default to MASTMLFeatureSelector

        savepath: (str), directory where 'features_selected_in_learning_curve.csv' is written; also passed to the
            MASTMLFeatureSelector fit call

        n_features_to_select: (int), total number of features to select, i.e. stopping criterion for number of features.
            If None (or 0), all columns of X are used

        Xgroups: (array-like or None), group labels, one per row of the data. If None or empty, every row is assigned
            to a single group (0)

    Returns:
        train_sizes: (numpy array), array of the numbers of features used (1, 2, ..., n_features_to_select)

        train_mean: (numpy array), array of means of training data scores for each number of features

        test_mean: (numpy array), array of means of testing data scores for each number of features

        train_stdev: (numpy array), array of standard deviations of training data scores for each number of features

        test_stdev: (numpy array), array of standard deviations of testing data scores for each number of features

    Raises:
        SystemExit: if selector_name is not None and not one of the supported selector names
        RuntimeError: re-raised from the RFE selector (e.g. estimator lacking coef_ / feature_importances_)

    """
    # Xgroups defaults to None and may be a plain list, so it cannot be assumed to have a
    # ``.shape`` attribute. Normalise it to a flat array, or one shared group if nothing was given.
    if Xgroups is None:
        Xgroups = np.zeros(len(y))
    else:
        Xgroups = np.asarray(Xgroups).reshape(-1, )
        if Xgroups.size == 0:
            Xgroups = np.zeros(len(y))

    # X.shape[1] is already a plain int (it has no .tolist()).
    if not n_features_to_select:
        n_features_to_select = int(X.shape[1])
    train_sizes = list(range(1, n_features_to_select + 1))

    # Resolve and validate the selector once, up front, so warnings are logged a single time
    # and an invalid name fails before any fitting is done.
    if selector_name is None:
        log.warning("A selector name for learning curve calculation was not found. Defaulting to using the "
                    "MASTMLFeatureSelector for learning curve")
        selector = 'MASTMLFeatureSelector'
    elif selector_name in ('RFE', 'SelectKBest', 'MASTMLFeatureSelector', 'SequentialFeatureSelector'):
        selector = selector_name
    else:
        log.error("You have specified an invalid selector_name for learning curve. Either leave blank to use the default"
                  " MASTMLFeatureSelector or use one of SelectKBest, RFE, SequentialFeatureSelector, MASTMLFeatureSelector")
        # Same exception type the builtin exit() raises, but without depending on the ``site``
        # module and with a non-zero status since this is an error path.
        raise SystemExit(1)

    if selector == 'RFE':
        log.warning("Using RFE as feature selector does not support a custom CV or grouping scheme. Your learning "
                    "curve will be generated properly, but will not use the custom CV or grouping scheme")
    elif selector == 'SelectKBest':
        log.warning("Using SelectKBest as feature selector does not support a custom estimator model, CV or grouping scheme. "
                    "Your learning curve will be generated properly, but will not use the custom model, CV or grouping scheme")
    elif selector == 'SequentialFeatureSelector':
        log.warning("Using SequentialFeatureSelector as feature selector does not support a custom CV or grouping scheme. "
                    "Your learning curve will be generated properly, but will not use the custom CV or grouping scheme")

    # Array view of the targets for positional indexing with the CV split indices. The selectors
    # below always receive the caller's original ``y`` so every iteration sees the same type.
    y_values = np.array(y)

    train_mean = list()
    train_stdev = list()
    test_mean = list()
    test_stdev = list()
    features_selected = list()
    n_features = list()
    for feature in train_sizes:
        n_features.append(feature)
        constructor = fs.name_to_constructor[selector]
        if selector == 'RFE':
            try:
                Xnew = constructor(estimator, feature).fit(X, y).transform(X)
            except RuntimeError:
                log.error("You have specified an estimator for RFE that does not have a coef_ or feature_importances_ attribute. "
                          "Acceptable models to use with RFE include: LinearRegression, Lasso, SVR, DecisionTreeRegressor, "
                          "RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, etc.")
                # Without a selection result there is nothing valid to score; do not continue
                # with an undefined (or stale, from the previous iteration) feature set.
                raise
        elif selector == 'SelectKBest':
            Xnew = constructor(f_regression, feature).fit(X, y).transform(X)
        elif selector == 'SequentialFeatureSelector':
            Xnew = constructor(estimator, feature).fit(pd.DataFrame(X), pd.DataFrame(y)).transform(X)
        else:
            # MASTMLFeatureSelector, whether named explicitly or chosen as the default: both
            # paths use the same fit call, including savepath.
            Xnew = constructor(estimator, feature, cv).fit(X, y, savepath, pd.DataFrame(Xgroups)).transform(X)

        # Record the selected column names while Xnew still carries them.
        features_selected.append(Xnew.columns.tolist())

        # Need to use arrays to avoid indexing issues when leaving out validation data
        Xnew = np.array(Xnew)
        fold_train_scores = list()
        fold_test_scores = list()
        for trains, tests in cv.split(Xnew, y_values, Xgroups):
            model = estimator.fit(Xnew[trains], y_values[trains])
            train_vals = model.predict(Xnew[trains])
            test_vals = model.predict(Xnew[tests])
            # scikit-learn metric functions take (y_true, y_pred); the order matters for
            # asymmetric metrics such as R^2.
            # NOTE(review): _score_func is a private scorer attribute and bypasses the scorer's sign convention.
            fold_train_scores.append(scoring._score_func(y_values[trains], train_vals))
            fold_test_scores.append(scoring._score_func(y_values[tests], test_vals))
        train_mean.append(np.mean(fold_train_scores))
        train_stdev.append(np.std(fold_train_scores))
        test_mean.append(np.mean(fold_test_scores))
        test_stdev.append(np.std(fold_test_scores))

    # Persist which features were chosen at each step of the curve.
    datadict = {"n_features": n_features, "features selected": features_selected}
    pd.DataFrame.from_dict(data=datadict).to_csv(os.path.join(savepath, 'features_selected_in_learning_curve.csv'))

    return np.array(train_sizes), np.array(train_mean), np.array(test_mean), np.array(train_stdev), np.array(test_stdev)