# mastml.metrics

"""
This module contains constructors for different model score metrics. Most model metrics are obtained from scikit-learn,
while others are custom variations.

The full list of score functions in scikit-learn can be found at: http://scikit-learn.org/stable/modules/model_evaluation.html
"""

import numpy as np
import sklearn.feature_selection as fs
import sklearn.metrics as sm
from sklearn.linear_model import LinearRegression # for r2_score_noint

classification_metrics = {
    'accuracy':           (True, sm.accuracy_score),
    'f1_binary':          (True, lambda yt, yp: sm.f1_score(yt, yp, average='binary')),
    'f1_macro':           (True, lambda yt, yp: sm.f1_score(yt, yp, average='macro')),
    'f1_micro':           (True, lambda yt, yp: sm.f1_score(yt, yp, average='micro')),
    'f1_samples':         (True, lambda yt, yp: sm.f1_score(yt, yp, average='samples')),
    'f1_weighted':        (True, lambda yt, yp: sm.f1_score(yt, yp, average='weighted')),
    'log_loss':           (False, sm.log_loss),
    'precision_binary':   (True, lambda yt, yp: sm.precision_score(yt, yp, average='binary')),
    'precision_macro':    (True, lambda yt, yp: sm.precision_score(yt, yp, average='macro')),
    'precision_micro':    (True, lambda yt, yp: sm.precision_score(yt, yp, average='micro')),
    'precision_samples':  (True, lambda yt, yp: sm.precision_score(yt, yp, average='samples')),
    'precision_weighted': (True, lambda yt, yp: sm.precision_score(yt, yp, average='weighted')),
    'recall_binary':      (True, lambda yt, yp: sm.recall_score(yt, yp, average='binary')),
    'recall_macro':       (True, lambda yt, yp: sm.recall_score(yt, yp, average='macro')),
    'recall_micro':       (True, lambda yt, yp: sm.recall_score(yt, yp, average='micro')),
    'recall_samples':     (True, lambda yt, yp: sm.recall_score(yt, yp, average='samples')),
    'recall_weighted':    (True, lambda yt, yp: sm.recall_score(yt, yp, average='weighted')),
    'roc_auc':            (True, sm.roc_auc_score),
}

regression_metrics = {
    'explained_variance':     (True, sm.explained_variance_score),
    'mean_absolute_error':    (False, sm.mean_absolute_error),
    'mean_squared_error':     (False, sm.mean_squared_error),
    'mean_squared_log_error': (False, sm.mean_squared_log_error),
    'median_absolute_error':  (False, sm.median_absolute_error),
    'R2':                     (True, sm.r2_score),
}
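
# Illustrative usage of the metric registries above (a minimal sketch; the y_true/y_pred arrays are
# hypothetical and not part of this module). Each entry maps a metric name to a
# (greater_is_better, score_function) tuple:
#
#     greater_is_better, score_fn = regression_metrics['mean_squared_error']
#     score = score_fn([1.0, 2.0, 3.0], [1.1, 1.9, 3.2])  # called as score_fn(y_true, y_pred)
#     # greater_is_better is False here, so lower scores indicate a better model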

def r2_score_noint(y_true, y_pred):
    """
    Method that calculates the R^2 value without fitting the y-intercept

    Args:
        y_true: (numpy array), array of true y data values
        y_pred: (numpy array), array of predicted y data values

    Returns:
        (float): score of R^2 with no y-intercept
    """
    lr = LinearRegression(fit_intercept=False)
    y_true = np.array(y_true).reshape(-1, 1)  # turn it from an n-vector into an n x 1 matrix
    lr.fit(y_true, y_pred)
    return lr.score(y_true, y_pred)
regression_metrics['R2_noint'] = (True, r2_score_noint)
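
# Hypothetical example of the no-intercept R^2 defined above (a sketch; the arrays are illustrative only):
#
#     y_true = [1.0, 2.0, 3.0, 4.0]
#     y_pred = [1.1, 2.1, 2.9, 4.2]
#     r2_score_noint(y_true, y_pred)  # R^2 of a through-origin linear fit of y_pred against y_true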
def r2_score_fitted(y_true, y_pred):
    """
    Method that calculates the R^2 value of a linear fit (with y-intercept) between the true and predicted values

    Args:
        y_true: (numpy array), array of true y data values
        y_pred: (numpy array), array of predicted y data values

    Returns:
        (float): score of R^2 for the fitted linear relation
    """
    lr = LinearRegression(fit_intercept=True)
    y_true = np.array(y_true).reshape(-1, 1)  # turn it from an n-vector into an n x 1 matrix
    lr.fit(y_true, y_pred)
    return lr.score(y_true, y_pred)
regression_metrics['R2_fitted'] = (True, r2_score_fitted)
def root_mean_squared_error(y_true, y_pred):
    """
    Method that calculates the root mean squared error (RMSE)

    Args:
        y_true: (numpy array), array of true y data values
        y_pred: (numpy array), array of predicted y data values

    Returns:
        (float): score of RMSE
    """
    return sm.mean_squared_error(y_true, y_pred) ** 0.5
regression_metrics['root_mean_squared_error'] = (False, root_mean_squared_error) # TODO: consider two rmse/stdev metrics: one that uses stdev of full data set, and one that uses stdev of data per split
def rmse_over_stdev(y_true, y_pred, train_y=None):
    """
    Method that calculates the root mean squared error (RMSE) of a set of data, divided by the standard deviation
    of the training data set.

    Args:
        y_true: (numpy array), array of true y data values
        y_pred: (numpy array), array of predicted y data values
        train_y: (numpy array), array of training y data values

    Returns:
        (float): score of RMSE divided by standard deviation of training data
    """
    if train_y is not None:
        stdev = np.std(train_y)
    else:
        stdev = np.std(y_true)
    rmse = root_mean_squared_error(y_true, y_pred)
    return rmse / stdev
regression_metrics['rmse_over_stdev'] = (False, rmse_over_stdev)
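
# Hypothetical example of the normalized RMSE defined above (a sketch; train_y is an assumed training-set array):
#
#     train_y = [0.0, 1.0, 2.0, 3.0, 4.0]
#     rmse_over_stdev(y_true=[1.0, 2.0], y_pred=[1.2, 1.8], train_y=train_y)
#     # RMSE of the predictions divided by np.std(train_y); if train_y is omitted, np.std(y_true) is used instead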
def adjusted_r2_score(y_true, y_pred, n_features=None):
    """
    Method that calculates the adjusted R^2 value

    Args:
        y_true: (numpy array), array of true y data values
        y_pred: (numpy array), array of predicted y data values
        n_features: (int), number of features used in the fit

    Returns:
        (float): score of adjusted R^2, or NaN if n_features is not provided
    """
    r2 = sm.r2_score(y_true, y_pred)
    n = len(y_true)   # n is the sample size
    p = n_features    # p is the number of features
    if p is None:
        # No n_features given, so the adjusted R^2 cannot be computed
        return np.nan
    return 1 - (((1 - r2) * (n - 1)) / (n - p - 1))
regression_metrics['R2_adjusted'] = (True, adjusted_r2_score)

classification_score_funcs = {
    'chi2':                fs.chi2,                 # Compute chi-squared stats between each non-negative feature and class.
    'f_classif':           fs.f_classif,            # Compute the ANOVA F-value for the provided sample.
    'mutual_info_classif': fs.mutual_info_classif,  # Estimate mutual information for a discrete target variable.
}

regression_score_funcs = {
    'f_regression':           fs.f_regression,            # Univariate linear regression tests.
    'mutual_info_regression': fs.mutual_info_regression,  # Estimate mutual information for a continuous target variable.
}

nice_names = {
    # classification:
    'accuracy': 'Accuracy',
    'f1_binary': '$F_1$',
    'f1_macro': 'f1_macro',
    'f1_micro': 'f1_micro',
    'f1_samples': 'f1_samples',
    'f1_weighted': 'f1_weighted',
    'log_loss': 'log_loss',
    'precision_binary': 'Precision',
    'precision_macro': 'prec_macro',
    'precision_micro': 'prec_micro',
    'precision_samples': 'prec_samples',
    'precision_weighted': 'prec_weighted',
    'recall_binary': 'Recall',
    'recall_macro': 'rcl_macro',
    'recall_micro': 'rcl_micro',
    'recall_samples': 'rcl_samples',
    'recall_weighted': 'rcl_weighted',
    'roc_auc': 'ROC_AUC',
    # regression:
    'explained_variance': 'expl_var',
    'mean_absolute_error': 'MAE',
    'mean_squared_error': 'MSE',
    'mean_squared_log_error': 'MSLE',
    'median_absolute_error': 'MedAE',
    'root_mean_squared_error': 'RMSE',
    'rmse_over_stdev': r'RMSE/$\sigma_y$',
    'R2': '$R^2$',
    'R2_noint': '$R^2_{noint}$',
    'R2_adjusted': '$R^2_{adjusted}$',
}
def check_and_fetch_names(metric_names, is_classification):
    """
    Method that checks whether the chosen metrics are appropriate for the user-specified task
    (i.e. classification vs. regression models)

    Args:
        metric_names: (list), list of names of the metrics to evaluate
        is_classification: (bool), whether the task is a classification task

    Returns:
        functions: (dict), dict containing the appropriate metric entries (classification vs. regression metrics)
    """
    task = 'classification' if is_classification else 'regression'
    metrics_dict = classification_metrics if is_classification else regression_metrics
    functions = {}
    for name in metric_names:
        if name not in metrics_dict:
            raise Exception(f"Metric '{name}' is not supported for {task}.\n"
                            f"Valid metrics for {task}: {list(metrics_dict.keys())}")
        functions[name] = metrics_dict[name]
    return functions
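
# Hypothetical example of fetching metric functions for a regression task (a sketch; the metric names and
# data below are illustrative, not a prescribed workflow):
#
#     selected = check_and_fetch_names(['root_mean_squared_error', 'R2'], is_classification=False)
#     for name, (greater_is_better, score_fn) in selected.items():
#         print(nice_names[name], score_fn([1.0, 2.0, 3.0], [1.1, 1.9, 3.2]))
#
# Requesting a metric from the wrong task (e.g. 'accuracy' with is_classification=False) raises an Exception
# listing the valid metric names for that task.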