"""
This module contains classes used for generating different types of analysis plots
Scatter:
This class contains a variety of scatter plot types, e.g. parity (predicted vs. true) plots
Error:
This class contains plotting methods used to better quantify the model errors and uncertainty quantification.
Histogram:
This class contains methods for constructing histograms of data distributions and visualization of model residuals.
Line:
This class contains methods for making line plots, e.g. for constructing learning curves of model performance vs.
amount of data or number of features.
"""
import warnings
import math
import os
import pandas as pd
import numpy as np
try:
from collections.abc import Iterable
except ImportError:
from collections import Iterable
from math import log, ceil
import scipy
from scipy.stats import gaussian_kde, norm
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.exceptions import NotFittedError
from mastml.metrics import Metrics
from mastml.error_analysis import ErrorUtils
import matplotlib
from matplotlib import pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure, figaspect
from matplotlib.font_manager import FontProperties
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1 import make_axes_locatable
# statsmodels is optional: it is only needed for the QQ plot in the Error class.
# Catch ImportError specifically so that unrelated failures (and KeyboardInterrupt /
# SystemExit) are not silently swallowed the way a bare "except:" would.
try:
    import statsmodels.api as sm
except ImportError:
    print('statsmodels is an optional dependency. If you want to create QQ plots for error analysis, do pip install statsmodels')

# Global plotting defaults applied when this module is imported
matplotlib.rc('font', size=18, family='sans-serif')  # set all font to bigger
matplotlib.rc('figure', autolayout=True)  # turn on autolayout

# NOTE: this silences every warning for the whole process, not only this module
warnings.filterwarnings(action="ignore")
class Scatter():
    """
    Class to generate scatter plots, such as parity plots showing true vs. predicted data values

    Args:
        None

    Methods:
        plot_predicted_vs_true: method to plot a parity plot

        plot_best_worst_split: method to find the best and worst split in an evaluation set and plot them together

        plot_best_worst_per_point: method to find all of the best and worst data points from an evaluation set and
            plot them together

        plot_predicted_vs_true_bars: method to plot the average predicted value of each data point from an evaluation
            set with error bars denoting the standard deviation in predicted values (or supplied error bars)

        plot_metric_vs_group: method to plot the metric value for each group during e.g. a LeaveOneGroupOut data split

    The arguments of each method are documented in the docstring of that method.
    """

    # Marker styles used when points are labeled by group. Colors are cycled first; the marker
    # shape advances each time the color list wraps around.
    _GROUP_COLORS = ('blue', 'green', 'red', 'purple', 'orange', 'black', 'grey')
    _GROUP_SHAPES = ('o', 's', '^', 'x', '<', '>', 'h')

    @classmethod
    def _group_style(cls, position):
        # Return the (color, marker) pair for the group at this position; wraps instead of running off the lists
        n_colors = len(cls._GROUP_COLORS)
        color = cls._GROUP_COLORS[position % n_colors]
        shape = cls._GROUP_SHAPES[(position // n_colors) % len(cls._GROUP_SHAPES)]
        return color, shape

    @classmethod
    def _scatter_points(cls, ax, x, y, groups=None):
        # Scatter x vs. y, either as one blue series or as one styled, labeled series per unique group
        if groups is None:
            ax.scatter(x, y, c='b', edgecolor='darkblue', zorder=2, s=100, alpha=0.7)
            return
        x = np.array(x)
        y = np.array(y)
        for position, group in enumerate(np.unique(groups)):
            color, shape = cls._group_style(position)
            members = np.where(groups == group)
            ax.scatter(x[members], y[members], c=color, marker=shape, zorder=2, s=100, alpha=0.7, label=group)
        ax.legend(loc='best', fontsize=8)

    @classmethod
    def _errorbar_points(cls, ax, x, y, yerr, groups=None):
        # Draw vertical error bars on x vs. y, either as one blue series or styled per unique group
        if groups is None:
            ax.errorbar(x, y, yerr=yerr, fmt='o',
                        markerfacecolor='blue', markeredgecolor='black', ecolor='blue',
                        markersize=10, alpha=0.7, capsize=3)
            return
        x = np.array(x)
        y = np.array(y)
        yerr = np.array(yerr)
        for position, group in enumerate(np.unique(groups)):
            color, shape = cls._group_style(position)
            members = np.where(groups == group)
            ax.errorbar(x[members], y[members], yerr=yerr[members], fmt=shape, markerfacecolor=color,
                        markeredgecolor=color, ecolor=color, markersize=10, alpha=0.7, capsize=3)
        ax.legend(loc='best', fontsize=8)

    @staticmethod
    def _find_split_dirs(savepath):
        # List the entries of savepath that look like split folders ('split_' in the name, not a .png image)
        splitdirs = [d for d in os.listdir(savepath) if 'split_' in d and '.png' not in d]
        if not splitdirs:
            raise ValueError('No split directories (names containing "split_") were found in ' + str(savepath))
        return splitdirs

    @staticmethod
    def _read_table(directory, stem, file_extension):
        # Read directory/stem + file_extension into a dataframe; only .xlsx and .csv are supported
        path = os.path.join(directory, stem + file_extension)
        if file_extension == '.xlsx':
            return pd.read_excel(path, engine='openpyxl')
        if file_extension == '.csv':
            return pd.read_csv(path)
        raise ValueError('Unsupported file_extension ' + repr(file_extension) + ', expected ".xlsx" or ".csv"')

    @staticmethod
    def _write_table(df, directory, stem, file_extension, index=True):
        # Write df to directory/stem as .xlsx or .csv; any other extension writes nothing
        if file_extension == '.xlsx':
            df.to_excel(os.path.join(directory, stem + '.xlsx'), index=index)
        elif file_extension == '.csv':
            df.to_csv(os.path.join(directory, stem + '.csv'), index=index)

    @staticmethod
    def _pred_stem(data_type):
        # Test predictions are stored as 'y_pred'; other data types as 'y_pred_<data_type>'
        return 'y_pred' if data_type == 'test' else 'y_pred_' + str(data_type)

    @staticmethod
    def _flatten(parts):
        # Concatenate the per-split sequences into one flat numpy array
        return np.array([item for part in parts for item in part])

    @staticmethod
    def _show_or_close(show_figure):
        # Show the figure only when the flag compares equal to True, otherwise close it
        if show_figure == True:  # noqa: E712
            plt.show()
        else:
            plt.close()

    @classmethod
    def plot_predicted_vs_true(cls, y_true, y_pred, savepath, data_type, x_label, metrics_list=None, show_figure=False,
                               ebars=None, file_extension='.csv', image_dpi=250, groups=None):
        """
        Method to plot a parity plot (predicted vs. true values) and save the figure and the plotted data

        Args:
            y_true: (pd.Series), series of true y data

            y_pred: (pd.Series), series of predicted y data

            savepath: (str), string denoting the save path for the figure image

            data_type: (str), string denoting the data type (e.g. train, test, leaveout)

            x_label: (str), string denoting the true and predicted property name

            metrics_list: (list), list of strings of metric names to evaluate and include on the figure. If None,
                r2_score, mean_absolute_error, root_mean_squared_error and rmse_over_stdev are used

            show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)

            ebars: (array-like), error bar value of each prediction (optional, default None for no error bars)

            file_extension: (str), '.csv' or '.xlsx', format of the saved data files. Other values save no data files

            image_dpi: (int), resolution of the saved image

            groups: (array-like), group label of each data point (optional). When given, points are styled per
                group, per-group metrics are saved and a metric-vs-group plot is made for every metric

        Returns:
            None
        """
        # Make the dataframe/array 1D if it isn't
        y_true = check_dimensions(y_true)
        y_pred = check_dimensions(y_pred)
        if groups is not None:
            # A plain list compared with '==' gives a single bool, so always compare against a 1D array
            groups = np.asarray(groups).ravel()

        fig, ax = make_fig_ax()

        # gather max and min
        maxx = max(np.nanmax(y_true), np.nanmax(y_pred))
        minn = min(np.nanmin(y_true), np.nanmin(y_pred))
        _set_tick_labels(ax, maxx, minn)

        cls._scatter_points(ax, y_true, y_pred, groups)
        if ebars is not None:
            cls._errorbar_points(ax, y_true, y_pred, ebars, groups)

        # draw dashed parity (y = x) line
        ax.plot([minn, maxx], [minn, maxx], 'k--', lw=2, zorder=1)
        ax.set_xlabel('True ' + x_label, fontsize=14)
        ax.set_ylabel('Predicted ' + x_label, fontsize=14)

        if metrics_list is None:
            # Use some default metric set
            metrics_list = ['r2_score', 'mean_absolute_error', 'root_mean_squared_error', 'rmse_over_stdev']
        stats_dict = Metrics(metrics_list=metrics_list).evaluate(y_true=y_true, y_pred=y_pred)

        if groups is not None:
            groups_unique = np.unique(groups)
            true_values = np.array(y_true)
            pred_values = np.array(y_pred)
            stats_dict_group = dict()
            for group in groups_unique:
                members = np.where(groups == group)
                stats_dict_group[group] = Metrics(metrics_list=metrics_list).evaluate(y_true=true_values[members],
                                                                                      y_pred=pred_values[members])
            stats_dict_group['Overall'] = Metrics(metrics_list=metrics_list).evaluate(y_true=y_true, y_pred=y_pred)
            stats_group_df = pd.DataFrame.from_dict(stats_dict_group, orient='index', columns=metrics_list)
            cls._write_table(stats_group_df, savepath, str(data_type) + '_stats_pergroup_summary', file_extension)
            cls.plot_metric_vs_group(groups_unique, stats_group_df, metrics_list, savepath, data_type, show_figure,
                                     file_extension, image_dpi)

        plot_stats(fig, stats_dict, x_align=0.65, y_align=0.90, fontsize=12)

        # The image name records whether error bars and group labels were drawn; the data file name only error bars
        data_stem = 'parity_plot_withcalibratederrorbars_' if ebars is not None else 'parity_plot_'
        image_stem = data_stem + 'grouplabels_' if groups is not None else data_stem
        fig.savefig(os.path.join(savepath, image_stem + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')

        plotted = {'y true': y_true, 'y pred': y_pred}
        if ebars is not None:
            plotted['y err'] = ebars
        cls._write_table(pd.DataFrame(plotted), savepath, data_stem + str(data_type), file_extension, index=False)

        cls._show_or_close(show_figure)
        return

    @classmethod
    def plot_best_worst_split(cls, savepath, data_type, x_label, metrics_list, show_figure=False, file_extension='.csv', image_dpi=250):
        """
        Method to find the best and worst split (lowest and highest root_mean_squared_error in the per-split stats
        summary files) in an evaluation set and plot them together on one parity plot

        Args:
            savepath: (str), path containing the split folders; the figure image is also saved here

            data_type: (str), string denoting the data type (e.g. train, test, leaveout)

            x_label: (str), string denoting the true and predicted property name

            metrics_list: (list), list of strings of metric names to evaluate and include on the figure

            show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)

            file_extension: (str), '.csv' or '.xlsx', format of the data files stored in the split folders

            image_dpi: (int), resolution of the saved image

        Returns:
            None

        Raises:
            ValueError: if no split folder is found, if file_extension is not supported, or if no split has a
                non-NaN root_mean_squared_error
        """
        splitdirs = cls._find_split_dirs(savepath)

        # Collect the RMSE of every split from its stats summary file (first record), skipping NaN values
        rmse_by_split = dict()
        for splitdir in splitdirs:
            summary = cls._read_table(os.path.join(savepath, splitdir), str(data_type) + '_stats_summary', file_extension)
            rmse = summary.to_dict('records')[0]['root_mean_squared_error']
            if not pd.isna(rmse):
                rmse_by_split[splitdir] = rmse
        if not rmse_by_split:
            raise ValueError('No split in ' + str(savepath) + ' has a valid root_mean_squared_error value')

        # Find best/worst splits based on RMSE value (the first split wins ties)
        best_split = min(rmse_by_split, key=rmse_by_split.get)
        worst_split = max(rmse_by_split, key=rmse_by_split.get)

        true_stem = 'y_' + str(data_type)
        pred_stem = cls._pred_stem(data_type)
        best_dir = os.path.join(savepath, best_split)
        worst_dir = os.path.join(savepath, worst_split)

        # Make the dataframe/array 1D if it isn't
        y_true_best = check_dimensions(cls._read_table(best_dir, true_stem, file_extension))
        y_pred_best = check_dimensions(cls._read_table(best_dir, pred_stem, file_extension))
        y_true_worst = check_dimensions(cls._read_table(worst_dir, true_stem, file_extension))
        y_pred_worst = check_dimensions(cls._read_table(worst_dir, pred_stem, file_extension))

        fig, ax = make_fig_ax()

        # gather max and min
        maxx = max(np.nanmax(y_true_best), np.nanmax(y_pred_best), np.nanmax(y_true_worst), np.nanmax(y_pred_worst))
        minn = min(np.nanmin(y_true_best), np.nanmin(y_pred_best), np.nanmin(y_true_worst), np.nanmin(y_pred_worst))
        _set_tick_labels(ax, maxx, minn)

        ax.scatter(y_true_best, y_pred_best, c='b', edgecolor='darkblue', zorder=2, s=100, alpha=0.7, label='Best split')
        ax.scatter(y_true_worst, y_pred_worst, c='r', edgecolor='darkred', zorder=2, s=100, alpha=0.7, label='Worst split')
        ax.legend(loc='best')

        # draw dashed parity (y = x) line
        ax.plot([minn, maxx], [minn, maxx], 'k--', lw=2, zorder=1)
        ax.set_xlabel('True ' + x_label, fontsize=14)
        ax.set_ylabel('Predicted ' + x_label, fontsize=14)

        stats_dict_best = Metrics(metrics_list=metrics_list).evaluate(y_true=y_true_best, y_pred=y_pred_best)
        stats_dict_worst = Metrics(metrics_list=metrics_list).evaluate(y_true=y_true_worst, y_pred=y_pred_worst)
        plot_stats(fig, stats_dict_best, x_align=0.65, y_align=0.90, font_dict={'fontsize': 12, 'color': 'blue'})
        plot_stats(fig, stats_dict_worst, x_align=0.65, y_align=0.50, font_dict={'fontsize': 12, 'color': 'red'})

        # Save image
        fig.savefig(os.path.join(savepath, 'parity_plot_best_worst_split_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')

        cls._show_or_close(show_figure)
        return

    @classmethod
    def plot_best_worst_per_point(cls, savepath, data_type, x_label, metrics_list, show_figure=False, file_extension='.csv', image_dpi=250):
        """
        Method to find, for every data point, its best and worst prediction over all splits of an evaluation set
        and plot them together on one parity plot

        Args:
            savepath: (str), path containing the split folders; the figure image is also saved here

            data_type: (str), string denoting the data type (e.g. train, test, leaveout)

            x_label: (str), string denoting the true and predicted property name

            metrics_list: (list), list of strings of metric names to evaluate and include on the figure

            show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)

            file_extension: (str), '.csv' or '.xlsx', format of the data files stored in the split folders

            image_dpi: (int), resolution of the saved image

        Returns:
            None

        Raises:
            ValueError: if no split folder is found or if file_extension is not supported
        """
        splitdirs = cls._find_split_dirs(savepath)

        # Get lists of all ytrue, ypred and data indices for each split
        true_parts = list()
        pred_parts = list()
        index_parts = list()
        for splitdir in splitdirs:
            split_path = os.path.join(savepath, splitdir)
            true_parts.append(np.array(check_dimensions(cls._read_table(split_path, 'y_' + str(data_type), file_extension))))
            pred_parts.append(np.array(check_dimensions(cls._read_table(split_path, cls._pred_stem(data_type), file_extension))))
            index_parts.append(np.array(check_dimensions(cls._read_table(split_path, str(data_type) + '_inds', file_extension))))

        all_y_true_flat = cls._flatten(true_parts)
        all_y_pred_flat = cls._flatten(pred_parts)
        all_indices_flat = cls._flatten(index_parts)

        # Loop over the unique data indices. The best/worst prediction is picked from the predictions of
        # that data point only, so equal residuals on other points cannot leak into the selection.
        y_true_unique = list()
        bests = list()
        worsts = list()
        for ind in np.unique(all_indices_flat):
            members = np.where(all_indices_flat == ind)[0]
            point_true = all_y_true_flat[members]
            point_pred = all_y_pred_flat[members]
            point_abs_residuals = np.abs(point_pred - point_true)
            y_true_unique.append(np.mean(point_true))
            bests.append(point_pred[np.argmin(point_abs_residuals)])
            worsts.append(point_pred[np.argmax(point_abs_residuals)])

        y_true_unique = np.array(y_true_unique).ravel()
        bests = np.array(bests).ravel()
        worsts = np.array(worsts).ravel()

        stats_dict_best = Metrics(metrics_list=metrics_list).evaluate(y_true=y_true_unique, y_pred=bests)
        stats_dict_worst = Metrics(metrics_list=metrics_list).evaluate(y_true=y_true_unique, y_pred=worsts)

        fig, ax = make_fig_ax(x_align=0.65)

        # gather max and min
        maxx = float(max(np.nanmax(y_true_unique), np.nanmax(bests), np.nanmax(worsts)))
        minn = float(min(np.nanmin(y_true_unique), np.nanmin(bests), np.nanmin(worsts)))

        # draw dashed parity (y = x) line
        ax.plot([minn, maxx], [minn, maxx], 'k--', lw=2, zorder=1)

        # set axis labels
        ax.set_xlabel('True ' + x_label, fontsize=16)
        ax.set_ylabel('Predicted ' + x_label, fontsize=16)

        # set tick labels
        _set_tick_labels(ax, maxx, minn)

        ax.scatter(y_true_unique, bests, c='b', alpha=0.7, label='best all points', edgecolor='darkblue', zorder=2, s=100)
        ax.scatter(y_true_unique, worsts, c='r', alpha=0.7, label='worst all points', edgecolor='darkred', zorder=2, s=70)
        ax.legend(loc='best', fontsize=12)

        plot_stats(fig, stats_dict_best, x_align=0.65, y_align=0.90, font_dict={'fontsize': 10, 'color': 'b'})
        plot_stats(fig, stats_dict_worst, x_align=0.65, y_align=0.50, font_dict={'fontsize': 10, 'color': 'r'})

        # Save image
        fig.savefig(os.path.join(savepath, 'parity_plot_best_worst_eachpoint_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')

        cls._show_or_close(show_figure)
        return

    @classmethod
    def plot_predicted_vs_true_bars(cls, savepath, x_label, data_type, metrics_list, show_figure=False, ebars=None,
                                    file_extension='.csv', image_dpi=250, groups=None):
        """
        Method to plot the average predicted value of each data point over all splits of an evaluation set, with
        error bars denoting either the supplied error bar values or the standard deviation in predicted values

        Args:
            savepath: (str), path containing the split folders; the figure and data files are also saved here

            x_label: (str), string denoting the true and predicted property name

            data_type: (str), string denoting the data type (e.g. train, test, leaveout)

            metrics_list: (list), accepted for interface consistency. The metrics shown on the figure are the
                columns of the stats summary file of the last split folder, not this list

            show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)

            ebars: (array-like), error bar value for every row of the concatenated per-split data (optional). The
                values are averaged per data point. If None, the standard deviation of the predictions is used

            file_extension: (str), '.csv' or '.xlsx', format of the data files stored in the split folders

            image_dpi: (int), resolution of the saved image

            groups: (any), if not None, the group labels are read from the '<data_type>_groups' file of each
                split folder and the points are styled per group

        Returns:
            None

        Raises:
            ValueError: if no split folder is found or if file_extension is not supported
        """
        splitdirs = cls._find_split_dirs(savepath)
        use_groups = groups is not None

        # Get lists of all ytrue, ypred, data indices (and group labels) for each split
        true_parts = list()
        pred_parts = list()
        index_parts = list()
        group_parts = list()
        for splitdir in splitdirs:
            split_path = os.path.join(savepath, splitdir)
            true_parts.append(np.array(check_dimensions(cls._read_table(split_path, 'y_' + str(data_type), file_extension))))
            pred_parts.append(np.array(check_dimensions(cls._read_table(split_path, cls._pred_stem(data_type), file_extension))))
            index_parts.append(np.array(check_dimensions(cls._read_table(split_path, str(data_type) + '_inds', file_extension))))
            if use_groups:
                group_parts.append(np.array(cls._read_table(split_path, str(data_type) + '_groups', file_extension)))

        df_all = pd.DataFrame({'all_y_true': cls._flatten(true_parts),
                               'all_y_pred': cls._flatten(pred_parts),
                               'all_data_inds': cls._flatten(index_parts)})
        value_columns = ['all_y_true', 'all_y_pred']
        if use_groups:
            df_all['all_groups'] = cls._flatten(group_parts)
        if ebars is not None:
            df_all['ebars'] = np.array(ebars)
            value_columns.append('ebars')

        # Average per data point, in order of first appearance. Only the numeric value columns are
        # aggregated, because the group label column has no meaningful mean or standard deviation.
        df_all_grouped = df_all.groupby('all_data_inds', as_index=False, sort=False)
        df_avg = df_all_grouped[value_columns].mean()
        df_std = df_all_grouped[value_columns].std()

        # make fig and ax, use x_align when placing text so things don't overlap
        x_align = 0.64
        fig, ax = make_fig_ax(x_align=x_align)

        trues = df_avg['all_y_true']
        preds = df_avg['all_y_pred']
        yerr_values = df_avg['ebars'] if ebars is not None else df_std['all_y_pred']

        if use_groups:
            # Use the first row of every data point (from the groupby) to get one group label per point
            first_rows = [rows[0] for rows in df_all_grouped.groups.values()]
            groups = np.array(df_all['all_groups'])[first_rows]

        # gather max and min
        maxx = max(np.nanmax(trues), np.nanmax(preds))
        minn = min(np.nanmin(trues), np.nanmin(preds))

        # draw dashed parity (y = x) line
        ax.plot([minn, maxx], [minn, maxx], 'k--', lw=2, zorder=1)

        # set axis labels
        ax.set_xlabel('True ' + x_label, fontsize=16)
        ax.set_ylabel('Predicted ' + x_label, fontsize=16)

        # set tick labels
        _set_tick_labels(ax, maxx, minn)

        # With supplied error bars the points are drawn as a scatter series first; in both cases
        # the error bars themselves carry markers as well
        if ebars is not None:
            cls._scatter_points(ax, trues, preds, groups)
        cls._errorbar_points(ax, trues, preds, yerr_values, groups)

        # Average and standard deviation of every metric in the per-split stats summary files
        stats_by_split = dict()
        for splitdir in splitdirs:
            summary = cls._read_table(os.path.join(savepath, splitdir), str(data_type) + '_stats_summary', file_extension)
            stats_by_split[splitdir] = summary.to_dict('records')[0]
        metrics_list = list(stats_by_split[splitdirs[-1]].keys())
        avg_stats = dict()
        for metric in metrics_list:
            metric_values = [stats_by_split[splitdir][metric] for splitdir in splitdirs]
            avg_stats[metric] = (np.mean(metric_values), np.std(metric_values))

        plot_stats(fig, avg_stats, x_align=x_align, y_align=0.90)

        # The image name records whether error bars and group labels were drawn; the data file name only error bars
        data_stem = 'parity_plot_allsplits_average_'
        if ebars is not None:
            data_stem = data_stem + 'withcalibratederrorbars_'
        image_stem = data_stem + 'grouplabels_' if use_groups else data_stem
        fig.savefig(os.path.join(savepath, image_stem + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')

        df = pd.DataFrame({'y true': trues,
                           'average predicted values': preds,
                           'error bar values': yerr_values})
        cls._write_table(df, savepath, data_stem + str(data_type), file_extension, index=False)

        df_stats = pd.DataFrame.from_dict(avg_stats)
        cls._write_table(df_stats, savepath, str(data_type) + '_average_stdev_stats_summary', file_extension, index=False)

        cls._show_or_close(show_figure)
        return

    @classmethod
    def plot_metric_vs_group(cls, groups, stats_group_df, metrics_list, savepath, data_type, show_figure, file_extension='.csv', image_dpi=250):
        """
        Method to plot the metric value for each group during e.g. a LeaveOneGroupOut data split. One figure is
        saved per metric

        Args:
            groups: (array-like), the unique group labels, which must be row labels of stats_group_df

            stats_group_df: (pd.DataFrame), metric values with one row per group and one column per metric

            metrics_list: (list), list of strings of metric names (columns of stats_group_df) to plot

            savepath: (str), string denoting the save path for the figure image

            data_type: (str), string denoting the data type (e.g. train, test, leaveout)

            show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)

            file_extension: (str), not used by this method

            image_dpi: (int), resolution of the saved image

        Returns:
            None
        """
        groups = list(groups)
        # Plot every group at a fixed position so the tick labels always line up with the points,
        # also when the group labels are numbers
        positions = np.arange(len(groups))
        tick_labels = [str(group) for group in groups]

        for metric in metrics_list:
            metric_values = [stats_group_df.loc[group, metric] for group in groups]
            avg_stats = {metric: (np.mean(metric_values), np.std(metric_values))}

            # make fig and ax, use x_align when placing text so things don't overlap
            x_align = 0.64
            fig, ax = make_fig_ax(x_align=x_align)

            # do the actual plotting
            ax.scatter(positions, metric_values, c='blue', alpha=0.7, edgecolor='darkblue', zorder=2, s=100)

            # set axis labels
            ax.set_xlabel('Group', fontsize=14)
            ax.set_ylabel(metric, fontsize=14)
            ax.set_xticks(positions)
            ax.set_xticklabels(labels=tick_labels, fontsize=14)

            plot_stats(fig, avg_stats, x_align=x_align, y_align=0.90)

            fig.savefig(os.path.join(savepath, str(metric) + '_value_per_group_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')

            cls._show_or_close(show_figure)
        return
[docs]class Error():
"""
Class to make plots related to model error assessment and uncertainty quantification
Args:
None
Methods:
plot_normalized_error: Method to plot the normalized residual errors of a model prediction
Args:
residuals: (pd.Series), series containing the true errors (model residuals)
savepath: (str), string denoting the save path to save the figure to
data_type: (str), string denoting the data type, e.g. train, test, leftout
model_errors: (pd.Series), series containing the predicted model errors (optional, default None)
show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
Returns:
None
plot_cumulative_normalized_error: Method to plot the cumulative normalized residual errors of a model prediction
Args:
residuals: (pd.Series), series containing the true errors (model residuals)
savepath: (str), string denoting the save path to save the figure to
data_type: (str), string denoting the data type, e.g. train, test, leftout
model_errors: (pd.Series), series containing the predicted model errors (optional, default None)
show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
Returns:
None
plot_rstat: Method for plotting the r-statistic distribution (true divided by predicted error)
Args:
savepath: (str), string denoting the save path to save the figure to
data_type: (str), string denoting the data type, e.g. train, test, leftout
residuals: (pd.Series), series containing the true errors (model residuals)
model_errors: (pd.Series), series containing the predicted model errors
show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
is_calibrated: (bool), whether or not the model errors have been recalibrated (default False)
Returns:
None
plot_rstat_uncal_cal_overlay: Method for plotting the r-statistic distribution for two cases together: the as-obtained uncalibrated model errors and calibrated errors
Args:
savepath: (str), string denoting the save path to save the figure to
data_type: (str), string denoting the data type, e.g. train, test, leftout
residuals: (pd.Series), series containing the true errors (model residuals)
model_errors: (pd.Series), series containing the predicted model errors
model_errors_cal: (pd.Series), series containing the calibrated predicted model errors
show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
Returns:
None
plot_real_vs_predicted_error: Sometimes called the RvE plot, or residual vs. error plot, this method plots the binned RMS residuals as a function of the binned model errors
Args:
savepath: (str), string denoting the save path to save the figure to
model: (mastml.models object), a MAST-ML model object, e.g. SklearnModel or EnsembleModel
data_type: (str), string denoting the data type, e.g. train, test, leftout
model_errors: (pd.Series), series containing the predicted model errors
residuals: (pd.Series), series containing the true errors (model residuals)
dataset_stdev: (float), the standard deviation of the training dataset
show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
is_calibrated: (bool), whether or not the model errors have been recalibrated (default False)
well_sampled_fraction: (float), number denoting whether a bin qualifies as well-sampled or not. Default to 0.025 (2.5% of total samples). Only affects visuals, not fitting
Returns:
None
plot_real_vs_predicted_error_uncal_cal_overlay: Method for making the residual vs. error plot for two cases together: using the as-obtained uncalibrated model errors and calibrated errors
Args:
savepath: (str), string denoting the save path to save the figure to
model: (mastml.models object), a MAST-ML model object, e.g. SklearnModel or EnsembleModel
data_type: (str), string denoting the data type, e.g. train, test, leftout
model_errors: (pd.Series), series containing the predicted model errors
model_errors_cal: (pd.Series), series containing the calibrated predicted model errors
residuals: (pd.Series), series containing the true errors (model residuals)
dataset_stdev: (float), the standard deviation of the training dataset
show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
well_sampled_fraction: (float), number denoting whether a bin qualifies as well-sampled or not. Default to 0.025 (2.5% of total samples). Only affects visuals, not fitting
Returns:
None
"""
@classmethod
def plot_qq(cls, residuals, savepath, data_type, show_figure, image_dpi=250):
    """
    Draw a quantile-quantile plot of the residuals against a normal distribution and save it as
    'qq_plot_<data_type>.png' inside savepath.

    Args:
        residuals: (pd.Series), series containing the true errors (model residuals)
        savepath: (str), string denoting the save path to save the figure to
        data_type: (str), string denoting the data type; used in the output file name
        show_figure: (bool), whether or not the generated figure is output to the notebook screen
        image_dpi: (int), resolution of the saved image (default 250)

    Returns:
        None
    """
    _, ax = make_fig_ax(x_align=0.64)
    # qqplot draws onto the supplied axes and hands back the figure that is saved below
    qq_fig = sm.qqplot(data=residuals, dist=scipy.stats.distributions.norm, line='45', fit=True,
                       ax=ax, markerfacecolor='blue', markeredgecolor='darkblue',
                       zorder=2, markersize=10, alpha=0.7)

    ax.set_xlim(-5, 5)
    ax.set_ylim(-5, 5)
    ax.set_xlabel('Normal distribution quantiles', fontsize=14)
    ax.set_ylabel('Model residual quantiles', fontsize=14)

    # Restyle the second line on the axes (presumably the 45-degree reference line requested above)
    reference_line = ax.get_lines()[1]
    reference_line.set_color("black")
    reference_line.set_linewidth("1.5")
    reference_line.set_linestyle("--")

    # The axis label font size is finally set to 12, overriding the size passed to set_xlabel/set_ylabel
    for axis in (ax.xaxis, ax.yaxis):
        axis.get_label().set_fontsize(12)

    image_path = os.path.join(savepath, 'qq_plot_'+str(data_type)+'.png')
    qq_fig.savefig(image_path, dpi=image_dpi, bbox_inches='tight')
    if show_figure is True:
        plt.show()
    else:
        plt.close()
    return
@classmethod
def plot_normalized_error(cls, residuals, savepath, data_type, model_errors=None, show_figure=False, file_extension='.csv', image_dpi=250):
    """
    Plot the kernel density estimate of the normalized residuals together with a standard normal curve.

    The residuals are divided by their standard deviation. When model_errors is supplied, the density of
    residuals / model_errors is drawn as a third curve. The plotted data are written to
    'normalized_error_data_<data_type>' (.csv or .xlsx) and the figure to 'normalized_errors_<data_type>.png'.

    Args:
        residuals: (pd.Series), series containing the true errors (model residuals)
        savepath: (str), string denoting the save path to save the figure to
        data_type: (str), string denoting the data type; used in the output file names
        model_errors: (pd.Series), series containing the predicted model errors (optional, default None)
        show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
        file_extension: (str), '.csv' or '.xlsx'; format of the saved data file (default '.csv')
        image_dpi: (int), resolution of the saved image (default 250)

    Returns:
        None
    """
    fig, ax = make_fig_ax(x_align=0.64)
    mu, sigma = 0, 1

    # Work on a copy so the zero replacement below does not alter the caller's data
    residuals = residuals.copy()
    residuals[residuals == 0.0] = 10**-6
    normalized_residuals = residuals / np.std(residuals)
    density_residuals = gaussian_kde(normalized_residuals)

    x = np.linspace(mu - 5 * sigma, mu + 5 * sigma, residuals.shape[0])
    gaussian_curve = norm.pdf(x, mu, sigma)
    residual_curve = density_residuals(x)
    ax.plot(x, gaussian_curve, linewidth=4, color='blue', label="Analytical Gaussian")
    ax.plot(x, residual_curve, linewidth=4, color='green', label="Model Residuals")
    maxx, minn = 5, -5

    # Every curve drawn on the axes; used to derive the y-range handed to the tick helper
    curves = [residual_curve, gaussian_curve]
    if model_errors is not None:
        model_errors = model_errors.copy()
        # Zero error estimates would make the ratio undefined, so nudge them to a small value
        model_errors[model_errors == 0.0] = 0.0001
        rstat = residuals / model_errors
        error_curve = gaussian_kde(rstat)(x)
        curves.append(error_curve)
        ax.plot(x, error_curve, linewidth=4, color='purple', label="Model Errors")
        # Save data to csv file
        data_dict = {"Plotted x values": x, "model_errors": model_errors,
                     "residuals": residuals,
                     "model normalized residuals (plotted y green values)": residual_curve,
                     "model errors (plotted y purple values)": error_curve}
    else:
        # Save data to csv file
        data_dict = {"x values": x,
                     "model normalized residuals (plotted y green values)": residual_curve}

    # The lower bound is the smallest value over all curves (the minimum of each curve, not its maximum)
    maxy = max(max(curve) for curve in curves)
    miny = min(min(curve) for curve in curves)

    if file_extension == '.xlsx':
        pd.DataFrame(data_dict).to_excel(os.path.join(savepath, 'normalized_error_data_'+str(data_type)+'.xlsx'))
    elif file_extension == '.csv':
        pd.DataFrame(data_dict).to_csv(os.path.join(savepath, 'normalized_error_data_' + str(data_type) + '.csv'))

    ax.legend(loc=0, fontsize=12, frameon=False)
    ax.set_xlabel(r"$\mathrm{x}/\mathit{\sigma}$", fontsize=18)
    ax.set_ylabel("Probability density", fontsize=18)
    _set_tick_labels_different(ax, maxx, minn, maxy, miny)
    fig.savefig(os.path.join(savepath, 'normalized_errors_'+str(data_type)+'.png'), dpi=image_dpi, bbox_inches='tight')
    if show_figure is True:
        plt.show()
    else:
        plt.close()
    return
@classmethod
def plot_cumulative_normalized_error(cls, residuals, savepath, data_type, model_errors=None, show_figure=False, file_extension='.csv', image_dpi=250):
    """
    Plot the cumulative distribution of the absolute normalized residuals next to that of a sampled
    standard normal distribution, with a zoomed inset over x in [2, 3] and fraction in [0.9, 1].

    When model_errors is supplied, the cumulative distribution of |residuals / model_errors| is drawn too.
    The data are written to 'cumulative_normalized_errors_<data_type>' (.csv or .xlsx) and the figure to
    'cumulative_normalized_errors_<data_type>.png'.

    Args:
        residuals: (pd.Series), series containing the true errors (model residuals)
        savepath: (str), string denoting the save path to save the figure to
        data_type: (str), string denoting the data type; used in the output file names
        model_errors: (pd.Series), series containing the predicted model errors (optional, default None)
        show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
        file_extension: (str), '.csv' or '.xlsx'; format of the saved data file (default '.csv')
        image_dpi: (int), resolution of the saved image (default 250)

    Returns:
        None
    """
    def _cumulative_fraction(values):
        # Fractions 1/n, 2/n, ..., 1 matching the sorted values (plain float(); np.float no longer exists)
        return np.arange(1, len(values) + 1) / float(len(values))

    fig, ax = make_fig_ax(x_align=0.64)

    # Reference curve: absolute values of 10000 random draws from a standard normal distribution
    analytic_gau = abs(np.random.normal(0, 1, 10000))
    n_analytic = _cumulative_fraction(analytic_gau)
    X_analytic = np.sort(analytic_gau)

    # Work on a copy so the zero replacement below does not alter the caller's data
    residuals = residuals.copy()
    residuals[residuals == 0.0] = 10 ** -6
    normalized_residuals = abs(residuals / np.std(residuals))
    n_residuals = _cumulative_fraction(normalized_residuals)
    X_residuals = np.sort(normalized_residuals)

    ax.set_xlabel(r"$\mathrm{x}/\mathit{\sigma}$", fontsize=18)
    ax.set_ylabel("Fraction", fontsize=18)
    ax.step(X_residuals, n_residuals, linewidth=3, color='green', label="Model Residuals")
    ax.step(X_analytic, n_analytic, linewidth=3, color='blue', label="Analytical Gaussian")
    ax.set_xlim([0, 5])

    has_model_errors = model_errors is not None
    if has_model_errors:
        model_errors = model_errors.copy()
        # Zero error estimates would make the ratio undefined, so nudge them to a small value
        model_errors[model_errors == 0.0] = 0.0001
        rstat = abs(residuals / model_errors)
        n_errors = _cumulative_fraction(rstat)
        X_errors = np.sort(rstat)
        ax.step(X_errors, n_errors, linewidth=3, color='purple', label="Model Errors")
        # Save data to csv file
        data_dict = {"residuals": residuals,
                     "normalized residuals": normalized_residuals,
                     "Model Residuals (sorted, green data)": X_residuals,
                     "Model error values (r value: (ytrue-ypred)/(model error avg))": rstat,
                     "Model errors (sorted, purple values)": X_errors}
    else:
        # Save data to csv file
        data_dict = {"Model Residuals (sorted, green data)": X_residuals,
                     "model residuals": n_residuals}

    # One Series per column tolerates columns of different length; np.asarray drops any pandas index so
    # the columns line up by position instead of being re-aligned on index labels
    df = pd.DataFrame({key: pd.Series(np.asarray(values)) for key, values in data_dict.items()})
    if file_extension == '.xlsx':
        df.to_excel(os.path.join(savepath, 'cumulative_normalized_errors_'+str(data_type)+'.xlsx'), index=False)
    elif file_extension == '.csv':
        df.to_csv(os.path.join(savepath, 'cumulative_normalized_errors_'+str(data_type)+'.csv'), index=False)

    ax.legend(loc=0, fontsize=14, frameon=False)

    # Zoomed inset over the tail of the distributions
    xlabels = np.linspace(2, 3, 3)
    ylabels = np.linspace(0.9, 1, 2)
    axin = zoomed_inset_axes(ax, 2.5, loc=7)
    axin.step(X_residuals, n_residuals, linewidth=3, color='green', label="Model Residuals")
    axin.step(X_analytic, n_analytic, linewidth=3, color='blue', label="Analytical Gaussian")
    if has_model_errors:
        axin.step(X_errors, n_errors, linewidth=3, color='purple', label="Model Errors")
    # Pin the tick positions before labelling them so each label sits on the value it names
    axin.set_xticks(xlabels)
    axin.set_xticklabels(xlabels, fontsize=8, rotation=90)
    axin.set_yticks(ylabels)
    axin.set_yticklabels(ylabels, fontsize=8)
    axin.set_xlim([2, 3])
    axin.set_ylim([0.9, 1])

    maxx, minn = 5, 0
    maxy, miny = 1.1, 0
    _set_tick_labels_different(ax, maxx, minn, maxy, miny)
    mark_inset(ax, axin, loc1=1, loc2=2)
    fig.savefig(os.path.join(savepath, 'cumulative_normalized_errors_'+str(data_type)+'.png'), dpi=image_dpi, bbox_inches='tight')
    if show_figure is True:
        plt.show()
    else:
        plt.close()
    return
@classmethod
def plot_rstat(cls, savepath, data_type, residuals, model_errors, show_figure=False, is_calibrated=False, image_dpi=250):
    """
    Plot a density histogram of residuals / model errors with a standard normal curve overlaid, annotate
    its mean and standard deviation, and save it as 'rstat_histogram_<data_type>_<calibrated|uncalibrated>.png'.

    Args:
        savepath: (str), string denoting the save path to save the figure to
        data_type: (str), string denoting the data type; used in the output file name
        residuals: (pd.Series), series containing the true errors (model residuals)
        model_errors: (pd.Series), series containing the predicted model errors
        show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
        is_calibrated: (bool), whether or not the model errors have been recalibrated (default False)
        image_dpi: (int), resolution of the saved image (default 250)

    Returns:
        None
    """
    # Use flat positional arrays so a pandas index that does not start at 0 cannot break the filtering
    residuals = np.asarray(residuals).ravel()
    model_errors = np.asarray(model_errors).ravel()

    # Eliminate model errors with value 0, so that the ratios can be calculated
    nonzero = model_errors != 0
    rstat = residuals[nonzero] / model_errors[nonzero]

    # make data for gaussian plot
    gaussian_x = np.linspace(-5, 5, 1000)

    # create plot
    fig, ax = make_fig_ax(x_align=0.64)
    ax.set_xlabel('residuals / model error estimates')
    ax.set_ylabel('relative counts')
    ax.hist(rstat, bins=30, color='blue', edgecolor='black', density=True)
    ax.plot(gaussian_x, stats.norm.pdf(gaussian_x, 0, 1), label='Gaussian mu: 0 std: 1', color='black', linestyle='--', linewidth=1.5)
    ax.text(0.05, 0.9, 'mean = %.3f' % (np.mean(rstat)), transform=ax.transAxes)
    ax.text(0.05, 0.85, 'std = %.3f' % (np.std(rstat)), transform=ax.transAxes)

    # Always bound, whatever truthy/falsy value is passed for is_calibrated
    calibrate = 'calibrated' if is_calibrated else 'uncalibrated'
    fig.savefig(os.path.join(savepath, 'rstat_histogram_'+str(data_type)+'_'+calibrate+'.png'), dpi=image_dpi, bbox_inches='tight')
    if show_figure is True:
        plt.show()
    else:
        plt.close()
    return
@classmethod
def plot_rstat_uncal_cal_overlay(cls, savepath, data_type, residuals, model_errors, model_errors_cal,
                                 show_figure=False, image_dpi=250):
    """
    Overlay the density histograms of residuals / model errors for the uncalibrated (gray) and calibrated
    (blue) error estimates, with a standard normal curve, and save the figure as
    'rstat_histogram_<data_type>_uncal_cal_overlay.png'.

    Args:
        savepath: (str), string denoting the save path to save the figure to
        data_type: (str), string denoting the data type; used in the output file name
        residuals: (pd.Series), series containing the true errors (model residuals)
        model_errors: (pd.Series), series containing the predicted model errors
        model_errors_cal: (pd.Series), series containing the calibrated predicted model errors
        show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
        image_dpi: (int), resolution of the saved image (default 250)

    Returns:
        None
    """
    # Use flat positional arrays so a pandas index that does not start at 0 cannot break the filtering
    residuals = np.asarray(residuals).ravel()
    model_errors = np.asarray(model_errors).ravel()
    model_errors_cal = np.asarray(model_errors_cal).ravel()

    # Eliminate points whose uncalibrated or calibrated model error is 0, so that both ratios can be
    # calculated (a zero in either would give a non-finite ratio that the histogram cannot bin)
    usable = (model_errors != 0) & (model_errors_cal != 0)
    rstat_uncal = residuals[usable] / model_errors[usable]
    rstat_cal = residuals[usable] / model_errors_cal[usable]

    # make data for gaussian plot
    gaussian_x = np.linspace(-5, 5, 1000)

    # create plot
    fig, ax = make_fig_ax(x_align=0.64)
    ax.set_xlabel('residuals / model error estimates')
    ax.set_ylabel('relative counts')
    ax.hist(rstat_uncal, bins=30, color='gray', edgecolor='black', density=True, alpha=0.4)
    ax.hist(rstat_cal, bins=30, color='blue', edgecolor='black', density=True, alpha=0.4)
    ax.plot(gaussian_x, stats.norm.pdf(gaussian_x, 0, 1), label='Gaussian mu: 0 std: 1', color='black', linestyle='--', linewidth=1.5)

    # Summary statistics: gray text for the uncalibrated ratios, blue for the calibrated ones
    ax.text(0.05, 0.9, 'mean = %.3f' % (np.mean(rstat_uncal)), transform=ax.transAxes, fontdict={'fontsize': 10, 'color': 'gray'})
    ax.text(0.05, 0.85, 'std = %.3f' % (np.std(rstat_uncal)), transform=ax.transAxes, fontdict={'fontsize': 10, 'color': 'gray'})
    ax.text(0.05, 0.8, 'mean = %.3f' % (np.mean(rstat_cal)), transform=ax.transAxes, fontdict={'fontsize': 10, 'color': 'blue'})
    ax.text(0.05, 0.75, 'std = %.3f' % (np.std(rstat_cal)), transform=ax.transAxes, fontdict={'fontsize': 10, 'color': 'blue'})

    fig.savefig(os.path.join(savepath, 'rstat_histogram_'+str(data_type)+'_uncal_cal_overlay.png'), dpi=image_dpi, bbox_inches='tight')
    if show_figure is True:
        plt.show()
    else:
        plt.close()
    return
@classmethod
def plot_real_vs_predicted_error(cls, savepath, model, data_type, model_errors, residuals, dataset_stdev,
                                 show_figure=False, is_calibrated=False, well_sampled_number=30, image_dpi=250):
    """
    Plot the binned RMS residuals against the binned model errors (residual vs. error plot), with a
    weighted linear fit, a y = x reference line and a histogram of the counts per bin on top. The figure
    is saved as '<model_type>_residuals_vs_modelerror_<data_type>_<calibrated|uncalibrated>.png'.

    Args:
        savepath: (str), string denoting the save path to save the figure to
        model: (mastml.models object), a MAST-ML model object; model.model is inspected for its class name
        data_type: (str), must be one of "train", "test" or "leaveout"
        model_errors: (pd.Series), series containing the predicted model errors
        residuals: (pd.Series), series containing the true errors (model residuals)
        dataset_stdev: (float), the standard deviation of the training dataset
        show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
        is_calibrated: (bool), whether or not the model errors have been recalibrated (default False)
        well_sampled_number: (int), a bin with more than this many points is drawn as well-sampled
            (default 30). Only affects visuals, not fitting
        image_dpi: (int), resolution of the saved image (default 250)

    Returns:
        None

    Raises:
        ValueError: if data_type is not one of the accepted values, or if no bin contains any data
    """
    if data_type not in ['train', 'test', 'leaveout']:
        raise ValueError('Error: data_test_type must be one of "train", "test" or "leaveout"')

    bin_values, rms_residual_values, num_values_per_bin, _, _, var_sq_residual_values = ErrorUtils()._parse_error_data(
        model_errors=model_errors,
        residuals=residuals,
        dataset_stdev=dataset_stdev)

    # Short label for known estimators; any other estimator falls back to its class name
    model_name = model.model.__class__.__name__
    model_type = {'RandomForestRegressor': 'RF',
                  'GradientBoostingRegressor': 'GBR',
                  'ExtraTreesRegressor': 'ET',
                  'GaussianProcessRegressor': 'GPR',
                  'BaggingRegressor': 'BR',
                  'AdaBoostRegressor': 'ABR'}.get(model_name, model_name)

    bin_values = np.asarray(bin_values)
    rms_all = np.asarray(rms_residual_values, dtype=float)
    counts_all = np.asarray(num_values_per_bin)
    var_sq_all = np.asarray(var_sq_residual_values, dtype=float)

    # Bins without data (NaN RMS residual / zero count) can neither be plotted as points nor used in the fit.
    # A single mask keeps bin centres, RMS values, counts and error bars aligned with each other.
    populated = ~np.isnan(rms_all) & (counts_all != 0)
    if not populated.any():
        raise ValueError("---WARNING: ALL ERRORS TOO LARGE FOR PLOTTING---")
    bins_fit = bin_values[populated]
    rms_fit = rms_all[populated]
    counts_fit = counts_all[populated]
    var_sq_fit = var_sq_all[populated]

    # Error bar per bin: sqrt(var) / (2 * sqrt(n) * rms) when the bin holds more than one point, else 1
    yerr = np.ones_like(rms_fit)
    multi = counts_fit > 1
    yerr[multi] = np.sqrt(var_sq_fit[multi]) / (2 * np.sqrt(counts_fit[multi]) * rms_fit[multi])

    # Only the drawing style depends on whether a bin is well-sampled
    well = counts_fit > well_sampled_number
    poor = ~well

    # Make RF error plot
    fig, ax = make_fig_ax(aspect_ratio=0.5, x_align=0.65)
    ax.scatter(bins_fit[well], rms_fit[well], s=80, color='blue', alpha=0.7)
    ax.scatter(bins_fit[poor], rms_fit[poor], s=40, color='blue', alpha=0.3)
    ax.errorbar(bins_fit[well], rms_fit[well], yerr=yerr[well], ecolor='blue', capsize=2, linewidth=0, elinewidth=1)
    ax.errorbar(bins_fit[poor], rms_fit[poor], yerr=yerr[poor], ecolor='blue', capsize=2, linewidth=0, elinewidth=1, alpha=0.4)
    ax.set_xlabel(str(model_type) + ' model errors / dataset stdev', fontsize=12)
    ax.set_ylabel('RMS Absolute residuals\n / dataset stdev', fontsize=12)
    ax.tick_params(labelsize=10)

    # Fit the line to all data, including the poorly sampled data, and weight data points by number of samples per bin
    linear = LinearRegression(fit_intercept=True)
    linear.fit(bins_fit.reshape(-1, 1), rms_fit, sample_weight=counts_fit)
    yfit = linear.predict(bins_fit.reshape(-1, 1))
    ax.plot(bins_fit, yfit, 'k--', linewidth=2)
    r2 = r2_score(rms_fit, yfit, sample_weight=counts_fit)
    slope = linear.coef_[0]  # coef_ is a one-element array for a single feature
    intercept = linear.intercept_

    # Histogram of the number of points per bin, sharing the x axis with the main plot
    divider = make_axes_locatable(ax)
    axbarx = divider.append_axes("top", 1.2, pad=0.12, sharex=ax)
    axbarx.bar(x=bin_values, height=counts_all, width=bin_values[1]-bin_values[0], color='blue', edgecolor='black', alpha=0.7)
    axbarx.tick_params(labelsize=10, axis='y')
    axbarx.tick_params(labelsize=0, axis='x')
    axbarx.set_ylabel('Counts', fontsize=12)
    total_samples = sum(counts_all)

    # Axis limits come from the populated bins only, so a NaN entry cannot leak into them
    xmax = round(max(bins_fit) * 1.10, 2)
    ymax = round(max(rms_fit) * 1.10, 2)
    axbarx.text(0.6 * xmax, round(0.9 * max(counts_all)), 'Total counts = ' + str(total_samples),
                fontsize=10)
    ax.set_ylim(bottom=0, top=ymax)
    axbarx.set_ylim(bottom=0, top=round(max(counts_all) + 0.1*max(counts_all)))
    ax.set_xlim(left=0, right=xmax)
    ax.text(0.02, 0.9*ymax, 'R$^2$ = %3.2f ' % r2, fontdict={'fontsize': 10, 'color': 'k'})
    ax.text(0.02, 0.8*ymax, 'slope = %3.2f ' % slope, fontdict={'fontsize': 10, 'color': 'k'})
    ax.text(0.02, 0.7*ymax, 'intercept = %3.2f ' % intercept, fontdict={'fontsize': 10, 'color': 'k'})

    # Plot y = x line as reference point
    maxx = max(xmax, ymax)
    ax.plot([0, maxx], [0, maxx], linestyle='--', lw=2, zorder=1, color='gray', alpha=0.5)

    calibrate = 'calibrated' if is_calibrated else 'uncalibrated'
    fig.savefig(os.path.join(savepath, str(model_type) + '_residuals_vs_modelerror_' + str(data_type) + '_' + calibrate + '.png'),
                dpi=image_dpi, bbox_inches='tight')
    if show_figure is True:
        plt.show()
    else:
        plt.close()
    return
@classmethod
def plot_real_vs_predicted_error_uncal_cal_overlay(cls, savepath, model, data_type, model_errors, model_errors_cal,
                                                   residuals, dataset_stdev, show_figure=False,
                                                   well_sampled_number=30, image_dpi=250):
    """
    Make the residual vs. error plot for the uncalibrated (gray) and calibrated (blue) model errors on the
    same axes, each with its own weighted linear fit, plus a y = x reference line and a histogram of the
    counts per bin on top. The figure is saved as
    '<model_type>_residuals_vs_modelerror_<data_type>_uncal_cal_overlay.png'.

    Args:
        savepath: (str), string denoting the save path to save the figure to
        model: (mastml.models object), a MAST-ML model object; model.model is inspected for its class name
        data_type: (str), must be one of "train", "test" or "leaveout"
        model_errors: (pd.Series), series containing the predicted model errors
        model_errors_cal: (pd.Series), series containing the calibrated predicted model errors
        residuals: (pd.Series), series containing the true errors (model residuals)
        dataset_stdev: (float), the standard deviation of the training dataset
        show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
        well_sampled_number: (int), a bin with more than this many points is drawn as well-sampled
            (default 30). Only affects visuals, not fitting
        image_dpi: (int), resolution of the saved image (default 250)

    Returns:
        None

    Raises:
        ValueError: if data_type is not one of the accepted values, or if either set of model errors
            yields no bin containing data
    """
    if data_type not in ['train', 'test', 'leaveout']:
        raise ValueError('Error: data_test_type must be one of "train", "test" or "leaveout"')

    def _bin_statistics(errors):
        # Bin the residuals against `errors` and collect the aligned per-bin arrays used for drawing and fitting
        bins, rms, counts, _, _, var_sq = ErrorUtils()._parse_error_data(model_errors=errors,
                                                                          residuals=residuals,
                                                                          dataset_stdev=dataset_stdev)
        bins = np.asarray(bins)
        rms = np.asarray(rms, dtype=float)
        counts = np.asarray(counts)
        var_sq = np.asarray(var_sq, dtype=float)
        # Bins without data (NaN RMS residual / zero count) can neither be plotted as points nor fitted
        populated = ~np.isnan(rms) & (counts != 0)
        if not populated.any():
            raise ValueError('No populated bins available for the residual vs. model error plot')
        fit_bins, fit_rms, fit_counts = bins[populated], rms[populated], counts[populated]
        fit_var_sq = var_sq[populated]
        # Error bar per bin: sqrt(var) / (2 * sqrt(n) * rms) when the bin holds more than one point, else 1
        yerr = np.ones_like(fit_rms)
        multi = fit_counts > 1
        yerr[multi] = np.sqrt(fit_var_sq[multi]) / (2 * np.sqrt(fit_counts[multi]) * fit_rms[multi])
        return {'all_bins': bins, 'all_counts': counts,
                'bins': fit_bins, 'rms': fit_rms, 'counts': fit_counts, 'yerr': yerr,
                'well': fit_counts > well_sampled_number}

    def _weighted_fit(stats_dict):
        # Fit the line to all populated bins, including the poorly sampled ones, weighted by samples per bin
        linear = LinearRegression(fit_intercept=True)
        x_fit = stats_dict['bins'].reshape(-1, 1)
        linear.fit(x_fit, stats_dict['rms'], sample_weight=stats_dict['counts'])
        yfit = linear.predict(x_fit)
        r2 = r2_score(stats_dict['rms'], yfit, sample_weight=stats_dict['counts'])
        # coef_ is a one-element array for a single feature
        return yfit, r2, linear.coef_[0], linear.intercept_

    uncal = _bin_statistics(model_errors)
    cal = _bin_statistics(model_errors_cal)

    # Short label for known estimators; any other estimator falls back to its class name
    model_name = model.model.__class__.__name__
    model_type = {'RandomForestRegressor': 'RF',
                  'GradientBoostingRegressor': 'GBR',
                  'ExtraTreesRegressor': 'ET',
                  'GaussianProcessRegressor': 'GPR',
                  'BaggingRegressor': 'BR',
                  'AdaBoostRegressor': 'ABR'}.get(model_name, model_name)

    # Make RF error plot
    fig, ax = make_fig_ax(aspect_ratio=0.5, x_align=0.65)
    series = [(uncal, 'gray', 'uncalibrated'), (cal, 'blue', 'calibrated')]

    # Points and error bars: well-sampled bins are drawn larger and more opaque than poorly sampled ones
    for data, color, label in series:
        well = data['well']
        poor = ~well
        ax.scatter(data['bins'][well], data['rms'][well], s=80, color=color, edgecolor=color, alpha=0.7, label=label)
        ax.scatter(data['bins'][poor], data['rms'][poor], s=40, color=color, edgecolor=color, alpha=0.4)
        ax.errorbar(data['bins'][well], data['rms'][well], yerr=data['yerr'][well], ecolor=color, capsize=2, linewidth=0, elinewidth=1)
        ax.errorbar(data['bins'][poor], data['rms'][poor], yerr=data['yerr'][poor], ecolor=color, capsize=2, linewidth=0, elinewidth=1, alpha=0.4)

    ax.set_xlabel(str(model_type) + ' model errors / dataset stdev', fontsize=12)
    ax.set_ylabel('RMS Absolute residuals\n / dataset stdev', fontsize=12)
    ax.tick_params(labelsize=10)

    yfit_uncal, r2_uncal, slope_uncal, intercept_uncal = _weighted_fit(uncal)
    ax.plot(uncal['bins'], yfit_uncal, 'gray', linewidth=2)
    yfit_cal, r2_cal, slope_cal, intercept_cal = _weighted_fit(cal)
    ax.plot(cal['bins'], yfit_cal, 'blue', linewidth=2)

    # Histograms of the number of points per bin, sharing the x axis with the main plot
    divider = make_axes_locatable(ax)
    axbarx = divider.append_axes("top", 1.2, pad=0.12, sharex=ax)
    for data, color, _ in series:
        axbarx.bar(x=data['all_bins'], height=data['all_counts'], width=data['all_bins'][1] - data['all_bins'][0],
                   color=color, edgecolor=color, alpha=0.3)
    axbarx.tick_params(labelsize=10, axis='y')
    axbarx.tick_params(labelsize=0, axis='x')
    axbarx.set_ylabel('Counts', fontsize=12)

    # Axis limits come from the populated bins only, so a NaN entry cannot leak into them
    xmax = round(max([max(uncal['bins']), max(cal['bins'])]) * 1.10, 2)
    ymax = round(max([max(uncal['rms']), max(cal['rms'])]) * 1.10, 2)
    max_count_uncal = max(uncal['all_counts'])
    axbarx.text(0.6 * xmax, round(0.9 * max_count_uncal), 'Total counts = ' + str(sum(uncal['all_counts'])),
                fontsize=10)
    ax.set_ylim(bottom=0, top=ymax)
    axbarx.set_ylim(bottom=0, top=round(max_count_uncal + 0.1*max_count_uncal))
    ax.set_xlim(left=0, right=xmax)

    # Fit statistics: gray text for the uncalibrated fit, blue for the calibrated fit
    ax.text(0.02, 0.9*ymax, 'R$^2$ = %3.2f ' % r2_uncal, fontdict={'fontsize': 8, 'color': 'gray'})
    ax.text(0.02, 0.8*ymax, 'slope = %3.2f ' % slope_uncal, fontdict={'fontsize': 8, 'color': 'gray'})
    ax.text(0.02, 0.7*ymax, 'intercept = %3.2f ' % intercept_uncal, fontdict={'fontsize': 8, 'color': 'gray'})
    ax.text(0.02, 0.6*ymax, 'R$^2$ = %3.2f ' % r2_cal, fontdict={'fontsize': 8, 'color': 'blue'})
    ax.text(0.02, 0.5*ymax, 'slope = %3.2f ' % slope_cal, fontdict={'fontsize': 8, 'color': 'blue'})
    ax.text(0.02, 0.4*ymax, 'intercept = %3.2f ' % intercept_cal, fontdict={'fontsize': 8, 'color': 'blue'})

    # Plot y = x line as reference point
    maxx = max(xmax, ymax)
    ax.plot([0, maxx], [0, maxx], linestyle='--', lw=2, color='red', alpha=0.5)
    ax.legend(loc='lower right', fontsize=8)
    fig.savefig(os.path.join(savepath, str(model_type) + '_residuals_vs_modelerror_' + str(data_type) + '_uncal_cal_overlay.png'),
                dpi=image_dpi, bbox_inches='tight')
    if show_figure is True:
        plt.show()
    else:
        plt.close()
    return
class Histogram():
    """
    Class to generate histogram plots, such as histograms of residual values

    Args:
        None

    Methods:
        plot_histogram: method to plot a basic histogram of supplied data
            Args:
                df: (pd.DataFrame), dataframe or series of data to plot as a histogram
                savepath: (str), string denoting the save path for the figure image
                file_name: (str), string denoting the character of the file name, e.g. train vs. test
                x_label: (str), string denoting the property name
                show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)
                file_extension: (str), '.csv' or '.xlsx'; format of the saved data files (default '.csv')
                image_dpi: (int), resolution of the saved image (default 250)
            Returns:
                None

        plot_residuals_histogram: method to plot a histogram of residual values
            Args:
                y_true: (pd.Series), series of true y data
                y_pred: (pd.Series), series of predicted y data
                savepath: (str), string denoting the save path for the figure image
                show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)
                file_extension: (str), '.csv' or '.xlsx'; format of the saved data files (default '.csv')
                image_dpi: (int), resolution of the saved image (default 250)
                file_name: (str), base name of the output files (default 'residual_histogram')
            Returns:
                None

        _get_histogram_bins: Method to obtain the number of bins to use when plotting a histogram
            Args:
                df: (pandas Series or numpy array), array of y data used to construct histogram
            Returns:
                num_bins: (int), the number of bins to use when plotting a histogram
    """

    @classmethod
    def plot_histogram(cls, df, savepath, file_name, x_label, show_figure=False, file_extension='.csv', image_dpi=250):
        # Make the dataframe 1D if it isn't
        df = check_dimensions(df)

        # make fig and ax, use x_align when placing text so things don't overlap
        x_align = 0.70
        fig, ax = make_fig_ax(aspect_ratio=0.5, x_align=x_align)

        # Get num_bins using smarter method
        num_bins = cls._get_histogram_bins(df=df)

        # do the actual plotting
        ax.hist(df, bins=num_bins, color='b', edgecolor='k')

        # normal text stuff
        ax.set_xlabel(x_label, fontsize=14)
        ax.set_ylabel('Number of occurrences', fontsize=14)

        # Summary statistics are both printed on the figure and saved next to the raw data
        summary = df.describe()
        plot_stats(fig, dict(summary), x_align=x_align, y_align=0.90, fontsize=12)

        # Save data to excel/csv file and image
        if file_extension == '.xlsx':
            df.to_excel(os.path.join(savepath, file_name + '.xlsx'))
            summary.to_excel(os.path.join(savepath, file_name + '_statistics.xlsx'))
        elif file_extension == '.csv':
            df.to_csv(os.path.join(savepath, file_name + '.csv'))
            summary.to_csv(os.path.join(savepath, file_name + '_statistics.csv'))

        fig.savefig(os.path.join(savepath, file_name + '.png'), dpi=image_dpi, bbox_inches='tight')
        if show_figure:
            plt.show()
        else:
            plt.close()
        return

    @classmethod
    def plot_residuals_histogram(cls, y_true, y_pred, savepath, show_figure=False, file_extension='.csv', image_dpi=250, file_name='residual_histogram'):
        # Residuals are defined here as predicted minus true values
        y_true = check_dimensions(y_true)
        y_pred = check_dimensions(y_pred)
        residuals = y_pred-y_true
        cls.plot_histogram(df=residuals,
                           savepath=savepath,
                           file_name=file_name,
                           x_label='Residuals',
                           show_figure=show_figure,
                           file_extension=file_extension,
                           image_dpi=image_dpi)
        return

    @classmethod
    def _get_histogram_bins(cls, df):
        n_samples = df.shape[0]
        if n_samples == 0:
            return 10

        # Candidate bin counts are n / d for n evenly spaced dividers d running from n down to 5% of n
        # (all strictly positive, so the division is always defined)
        bin_dividers = np.linspace(n_samples, 0.05*n_samples, n_samples)
        candidates = (n_samples / bin_dividers).astype(int)

        # Keep only candidates giving fewer bins than half the number of samples and take the largest;
        # fall back to 10 bins when no candidate qualifies
        candidates = candidates[candidates < n_samples/2]
        if candidates.size > 0:
            return int(candidates.max())
        return 10
class Line():
    '''
    Class containing methods for constructing line plots

    Args:
        None

    Methods:
        plot_learning_curve: Method used to plot both data and feature learning curves
            Args:
                train_sizes: (numpy array), array of x-axis values, such as fraction of data used or number of features
                train_mean: (numpy array), array of training data mean values, averaged over some type/number of CV splits
                test_mean: (numpy array), array of test data mean values, averaged over some type/number of CV splits
                train_stdev: (numpy array), array of training data standard deviation values, from some type/number of CV splits
                test_stdev: (numpy array), array of test data standard deviation values, from some type/number of CV splits
                score_name: (str), type of score metric for learning curve plotting; used in y-axis label
                learning_curve_type: (str), type of learning curve employed: 'data_learning_curve' or 'feature_learning_curve'
                savepath: (str), path to save the plotted learning curve to
                image_dpi: (int), resolution of the saved image (default 250)
            Returns:
                None
    '''

    @classmethod
    def plot_learning_curve(cls, train_sizes, train_mean, test_mean, train_stdev, test_stdev, score_name,
                            learning_curve_type, savepath, image_dpi=250):
        # Set image aspect ratio (do custom for learning curve):
        width, height = figaspect(0.75)
        fig = Figure(figsize=(width, height))
        FigureCanvas(fig)
        grid = plt.GridSpec(1, 1)
        ax = fig.add_subplot(grid[0:, 0:])

        # One-standard-deviation bands around the mean train and validation scores
        train_upper = train_mean + train_stdev
        train_lower = train_mean - train_stdev
        test_upper = test_mean + test_stdev
        test_lower = test_mean - test_stdev

        # Axis ranges: x spans the training sizes, y spans the means and their bands
        max_x = max(train_sizes)
        min_x = min(train_sizes)
        max_y, min_y = recursive_max_and_min([
            train_mean,
            train_upper,
            train_lower,
            test_mean,
            test_upper,
            test_lower,
        ])

        # Round each limit in turn; every step uses the limits as they stand at that point
        max_x = round(float(max_x), rounder(max_x - min_x))
        min_x = round(float(min_x), rounder(max_x - min_x))
        max_y = round(float(max_y), rounder(max_y - min_y))
        min_y = round(float(min_y), rounder(max_y - min_y))
        _set_tick_labels_different(ax, max_x, min_x, max_y, min_y)

        # plot the curves and keep their handles for making the legend
        train_line = ax.plot(train_sizes, train_mean, '-o', color='blue', markersize=10, alpha=0.7)[0]
        ax.fill_between(train_sizes, train_lower, train_upper,
                        alpha=0.1, color='blue')
        test_line = ax.plot(train_sizes, test_mean, '-o', color='red', markersize=10, alpha=0.7)[0]
        ax.fill_between(train_sizes, test_lower, test_upper,
                        alpha=0.1, color='red')
        ax.legend([train_line, test_line], ['train score', 'validation score'], loc='center right', fontsize=12)

        # The x-axis label depends on which kind of learning curve is being drawn
        if learning_curve_type == 'data_learning_curve':
            x_label = 'Number of training data points'
        elif learning_curve_type == 'feature_learning_curve':
            x_label = 'Number of features selected'
        else:
            raise ValueError(
                'The param "learning_curve_type" must be either "data_learning_curve" or "feature_learning_curve"')
        ax.set_xlabel(x_label, fontsize=16)
        ax.set_ylabel(score_name, fontsize=16)

        image_path = os.path.join(savepath, learning_curve_type + '.png')
        fig.savefig(image_path, dpi=image_dpi, bbox_inches='tight')
        return
[docs]class Classification():
'''
Classification plots
Args:
None
Methods:
plot_classification_report: Method used to plot the classification report
Args:
savepath: (str), path to save the plotted learning curve to
data_type: (str), string denoting the data type, e.g. train, test, leftout
y_true: (pd.Series), series of true y data
y_pred: (pd.Series), series of predicted y data
show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)
Returns:
None
'''
[docs] def createClassificationReport(savepath, report_dict, show_figure=False, data_type=''):
# Parse report_dict data for export
class_metrics = list(report_dict.items())[:-3]
all_metric_names = list(class_metrics[0][1].keys())
class_names = [x[0] for x in class_metrics]
all_class_values = [list(x[1].values()) for x in class_metrics]
class_values_no_support = np.array([list(x[1].values())[:-1] for x in class_metrics])
metrics = list(report_dict.items())[-3:]
meta_names = [m[0] for m in metrics]
meta_values = np.array([([m[1]] if isinstance(m[1], float) else list(m[1].values())) for m in metrics], dtype=object)
report_as_array = []
report_as_array.append([""]+all_metric_names)
for i in range(len(class_names)):
report_as_array.append([class_names[i]] + all_class_values[i])
for i in range(len(meta_names)):
report_as_array.append([meta_names[i]] + meta_values[i])
# EXCEL
pd.DataFrame(report_as_array).to_excel(os.path.join(savepath, 'classification_report_'+str(data_type)+'.xlsx'), index=False, header=None)
# PLOT
plot_metric_names = all_metric_names[:-1]
dimensions = max(5, len(class_names))
fig, ax = plt.subplots(figsize=(dimensions, dimensions))
im = ax.imshow(class_values_no_support)
ax.set_xticks(np.arange(len(plot_metric_names)))
ax.set_yticks(np.arange(len(class_names)))
ax.set_xticklabels(plot_metric_names)
ax.set_yticklabels(class_names)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
font = {'family': 'serif',
'weight': 'normal',
'size': 10,
}
for i in range(len(class_names)):
for j in range(len(plot_metric_names)):
text = ax.text(j, i, round(class_values_no_support[i, j], 3),
ha="center",
va="center",
color="w",
fontdict=font)
ax.set_title("Classification Report")
fig.tight_layout()
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="15%", pad=0.05)
plt.colorbar(im, cax=cax)
fig.savefig(os.path.join(savepath, 'classification_report_'+str(data_type)+'.png'), bbox_inches='tight')
plt.show() if show_figure else plt.close()
@classmethod
def plot_classification_report(cls, savepath, data_type, y_true, y_pred, show_figure):
    """
    Compute a classification report for the given labels and pass it to createClassificationReport for export.

    Args:
        savepath: (str), string denoting the path to save output to
        data_type: (str), string denoting the data type analyzed, e.g. train, test, leftout
        y_true: true class labels
        y_pred: predicted class labels
        show_figure: (bool), whether or not the generated figure is output to the screen

    Returns:
        None.
    """
    report_dict = classification_report(y_true=y_true, y_pred=y_pred, output_dict=True)
    Classification.createClassificationReport(savepath=savepath,
                                              report_dict=report_dict,
                                              show_figure=show_figure,
                                              data_type=data_type)
[docs] @classmethod
def doProba(cls, model, X_test):
"""
Debugging helper that prints information about a model's predict_proba support.
Prints the model, whether it has a predict_proba attribute and, when it does, the output of
model.predict_proba(X_test). A NotFittedError raised by predict_proba is caught and its message is printed.
Args:
model: the model object to inspect
X_test: data passed unchanged to model.predict_proba
Returns:
None. Everything is written to stdout.
"""
print("="*4 + "doProba" + "="*4)
print("model:")
print(model)
# print("X_test:")
# print(X_test)
print(f"hasattr(model, 'predict_proba') {hasattr(model, 'predict_proba')}")
if hasattr(model, 'predict_proba'):
try:
print("predict_proba")
foo = model.predict_proba(X_test)
print(foo)
except NotFittedError as e:
# An unfitted model is reported rather than raised
print(e)
print("="*18)
def plot_feature_occurrence(savepath, feature, occurrence):
    """
    Draw a horizontal bar chart of how often each feature occurs across all of the splits and save it
    as 'Feature_Occurrence.png' in savepath.

    Args:
        savepath: (str), string denoting the path to save output to
        feature: (list) the list of features
        occurrence: (list) the list of occurrence of each feature

    Returns:
        None.
    """
    # Smaller y tick labels so long feature names fit next to the bars
    plt.tick_params(axis='y', labelsize=10)
    plt.barh(feature, occurrence)
    plt.title("Occurrence of Features")
    plt.xlabel("Occurrence")
    plt.ylabel("Feature")
    output_file = os.path.join(savepath, 'Feature_Occurrence.png')
    plt.savefig(output_file)
    # Clear the pyplot figure so the next plot starts from a clean state
    plt.clf()
def plot_avg_score_vs_occurrence(savepath, occurrence, score, std_score):
    """
    Function to plot the average score of each feature against their occurrence in all of the splits.
    The figure is saved as 'AvgScoreVsOccurrence.png' in savepath.

    Args:
        savepath: (str), string denoting the path to save output to
        occurrence: (list), the list of occurrence of each feature
        score: (list), the list of the feature ranking score of each feature
        std_score: (list), the list of standard deviation of the feature ranking score

    Returns:
        None.
    """
    # Convert to arrays so the +/- one standard deviation band is computed element-wise;
    # plain Python lists do not support "score - std_score".
    score = np.asarray(score)
    std_score = np.asarray(std_score)
    plt.plot(occurrence, score, marker='o')
    plt.fill_between(occurrence, score - std_score, score + std_score, alpha=0.1, color='blue')
    plt.title('Avg Score vs Occurrence')
    plt.xlabel('Occurrence')
    plt.ylabel('Avg Score')
    # Let os.path.join insert the separator, as plot_feature_occurrence does
    plt.savefig(os.path.join(savepath, 'AvgScoreVsOccurrence.png'))
    # Clear the pyplot figure so the next plot starts from a clean state
    plt.clf()
[docs]def make_plots(plots, y_true, y_pred, groups, dataset_stdev, metrics, model, residuals, model_errors, has_model_errors,
savepath, data_type, X_test=None, show_figure=False, recalibrate_errors=False, model_errors_cal=None, splits_summary=False,
file_extension='.csv', image_dpi=250):
"""
Helper function to make collections of different types of plots after a single or multiple data splits are evaluated.
The Histogram, Scatter and Error plotting calls are wrapped in try/except blocks that print a warning and carry on,
so a failing plot does not raise out of this function. The Classification calls are not wrapped.
Args:
plots: (list of str), list denoting which types of plots to make. Entries checked for are "Histogram", "Scatter", "Error" and "Classification"
y_true: (pd.Series), series containing the true y data
y_pred: (pd.Series), series containing the predicted y data
groups: (list), list denoting the group label for each data point. When not None, additional Scatter plots are made with groups passed through
dataset_stdev: (float), the dataset standard deviation
metrics: (list of str), list denoting the metric names to evaluate. See mastml.metrics.Metrics.metrics_zoo_ for full list
model: (mastml.models object), a MAST-ML model object, e.g. SklearnModel or EnsembleModel
residuals: (pd.Series), series containing the residuals (true model errors)
model_errors: (pd.Series), series containing the as-obtained uncalibrated model errors
has_model_errors: (bool), whether the model type used can be subject to UQ and thus have model errors calculated
savepath: (str), string denoting the path to save output to
data_type: (str), string denoting the data type analyzed, e.g. train, test, leftout
X_test: data passed unchanged to Classification.doProba when "Classification" is in plots (default None); presumably the test features -- confirm with caller
show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False). The two *_uncal_cal_overlay plots are always made with show_figure=False
recalibrate_errors: (bool), whether or not the model errors have been recalibrated (default False)
model_errors_cal: (pd.Series), series containing the calibrated predicted model errors
splits_summary: (bool), whether or not the data used in the plots comes from a collection of many splits (default False), False denotes a single split folder
file_extension: (str), forwarded to the plotting methods that take a file_extension argument (default '.csv')
image_dpi: (int), forwarded to the plotting methods as image_dpi (default 250)
Returns:
None.
"""
if 'Histogram' in plots:
try:
Histogram.plot_residuals_histogram(y_true=y_true,
y_pred=y_pred,
savepath=savepath,
file_name='residual_histogram_'+str(data_type),
show_figure=show_figure,
file_extension=file_extension,
image_dpi=image_dpi)
except:
print('Warning: unable to make Histogram.plot_residuals_histogram. Skipping...')
if 'Scatter' in plots:
try:
Scatter.plot_predicted_vs_true(y_true=y_true,
y_pred=y_pred,
savepath=savepath,
x_label='values',
data_type=data_type,
metrics_list=metrics,
show_figure=show_figure,
ebars=None,
file_extension=file_extension,
image_dpi=image_dpi,
groups=None)
if groups is not None:
Scatter.plot_predicted_vs_true(y_true=y_true,
y_pred=y_pred,
savepath=savepath,
x_label='values',
data_type=data_type,
metrics_list=metrics,
show_figure=show_figure,
ebars=None,
file_extension=file_extension,
image_dpi=image_dpi,
groups=groups)
if recalibrate_errors == True:
Scatter.plot_predicted_vs_true(y_true=y_true,
y_pred=y_pred,
savepath=savepath,
x_label='values',
data_type=data_type,
metrics_list=metrics,
show_figure=show_figure,
ebars=model_errors_cal,
file_extension=file_extension,
image_dpi=image_dpi,
groups=None)
if groups is not None:
Scatter.plot_predicted_vs_true(y_true=y_true,
y_pred=y_pred,
savepath=savepath,
x_label='values',
data_type=data_type,
metrics_list=metrics,
show_figure=show_figure,
ebars=model_errors_cal,
file_extension=file_extension,
image_dpi=image_dpi,
groups=groups)
except:
print('Warning: unable to make Scatter.plot_predicted_vs_true plot. Skipping...')
if splits_summary is True:
if data_type != 'leaveout':
try:
Scatter.plot_best_worst_split(savepath=savepath,
data_type=data_type,
x_label='values',
metrics_list=metrics,
show_figure=show_figure,
file_extension=file_extension,
image_dpi=image_dpi)
except:
print('Warning: unable to make Scatter.plot_best_worst_split plot. Skipping...')
try:
Scatter.plot_best_worst_per_point(savepath=savepath,
data_type=data_type,
x_label='values',
metrics_list=metrics,
show_figure=show_figure,
file_extension=file_extension,
image_dpi=image_dpi)
except:
print('Warning: unable to make Scatter.plot_best_worst_per_point plot. Skipping...')
try:
Scatter.plot_predicted_vs_true_bars(savepath=savepath,
data_type=data_type,
x_label='values',
metrics_list=metrics,
show_figure=show_figure,
file_extension=file_extension,
image_dpi=image_dpi,
groups=None)
if groups is not None:
Scatter.plot_predicted_vs_true_bars(savepath=savepath,
data_type=data_type,
x_label='values',
metrics_list=metrics,
show_figure=show_figure,
file_extension=file_extension,
image_dpi=image_dpi,
groups=groups)
if recalibrate_errors == True:
Scatter.plot_predicted_vs_true_bars(savepath=savepath,
data_type=data_type,
x_label='values',
metrics_list=metrics,
show_figure=show_figure,
ebars=model_errors_cal,
file_extension=file_extension,
image_dpi=image_dpi,
groups=None)
if groups is not None:
Scatter.plot_predicted_vs_true_bars(savepath=savepath,
data_type=data_type,
x_label='values',
metrics_list=metrics,
show_figure=show_figure,
ebars=model_errors_cal,
file_extension=file_extension,
image_dpi=image_dpi,
groups=groups)
except:
print('Warning: unable to make Scatter.plot_predicted_vs_true_bars plot. Skipping...')
if 'Error' in plots:
try:
Error.plot_qq(residuals=residuals,
savepath=savepath,
data_type=data_type,
show_figure=show_figure,
image_dpi=image_dpi)
except:
print('Warning: unable to make Error.plot_qq plot. Skipping...')
try:
Error.plot_normalized_error(residuals=residuals,
savepath=savepath,
data_type=data_type,
model_errors=model_errors,
show_figure=show_figure,
file_extension=file_extension,
image_dpi=image_dpi)
except:
print('Warning: unable to make Error.plot_normalized_error plot. Skipping...')
try:
Error.plot_cumulative_normalized_error(residuals=residuals,
savepath=savepath,
data_type=data_type,
model_errors=model_errors,
show_figure=show_figure,
file_extension=file_extension,
image_dpi=image_dpi)
except:
print('Warning: unable to make Error.plot_cumulative_normalized_error plot. Skipping...')
if has_model_errors is True:
try:
Error.plot_rstat(savepath=savepath,
data_type=data_type,
model_errors=model_errors,
residuals=residuals,
show_figure=show_figure,
is_calibrated=False,
image_dpi=image_dpi)
except:
print('Warning: unable to make Error.plot_rstat plot. Skipping...')
try:
Error.plot_real_vs_predicted_error(savepath=savepath,
model=model,
data_type=data_type,
model_errors=model_errors,
residuals=residuals,
dataset_stdev=dataset_stdev,
show_figure=show_figure,
is_calibrated=False,
image_dpi=image_dpi)
except:
print('Warning: unable to make Error.plot_real_vs_predicted_error plot. Skipping...')
if recalibrate_errors is True:
try:
Error.plot_rstat(savepath=savepath,
data_type=data_type,
residuals=residuals,
model_errors=model_errors_cal,
show_figure=show_figure,
is_calibrated=True,
image_dpi=image_dpi)
except:
print('Warning: unable to make Error.plot_rstat plot. Skipping...')
try:
Error.plot_rstat_uncal_cal_overlay(savepath=savepath,
data_type=data_type,
residuals=residuals,
model_errors=model_errors,
model_errors_cal=model_errors_cal,
show_figure=False,
image_dpi=image_dpi)
except:
print('Warning: unable to make Error.plot_rstat_uncal_cal_overlay plot. Skipping...')
try:
Error.plot_real_vs_predicted_error(savepath=savepath,
model=model,
data_type=data_type,
residuals=residuals,
model_errors=model_errors_cal,
dataset_stdev=dataset_stdev,
show_figure=show_figure,
is_calibrated=True,
image_dpi = image_dpi)
except:
print('Warning: unable to make Error.plot_real_vs_predicted_error plot. Skipping...')
try:
Error.plot_real_vs_predicted_error_uncal_cal_overlay(savepath=savepath,
model=model,
data_type=data_type,
model_errors=model_errors,
model_errors_cal=model_errors_cal,
residuals=residuals,
dataset_stdev=dataset_stdev,
show_figure=False,
image_dpi=image_dpi)
except:
print('Warning: unable to make Error.plot_real_vs_predicted_error_uncal_cal_overlay plot. Skipping...')
if 'Classification' in plots:
Classification.plot_classification_report(savepath=savepath, data_type=data_type, y_true=y_true, y_pred=y_pred, show_figure=show_figure)
Classification.doProba(model=model, X_test=X_test)
return
def check_dimensions(y):
    """
    Reduce supplied data to one dimension. Plotters need 1D data but are often handed 2D input.

    Args:
        y: (numpy array or pd.DataFrame), array or dataframe of data used for plotting

    Returns:
        y: numpy arrays are flattened and returned as the squeezed result of a one-column dataframe; dataframes
            with more than one dimension are squeezed; any other input is returned unchanged
    """
    kind = type(y)
    has_extra_dims = len(y.shape) > 1
    if kind == np.ndarray:
        # Flatten first so an (n, 1) array becomes one column of n values
        flat = y.ravel() if has_extra_dims else y
        return pd.DataFrame(flat).squeeze()
    if has_extra_dims and kind == pd.core.frame.DataFrame:
        return y.squeeze()
    return y
def reset_index(y):
    """
    Rebuild y as a new dataframe from its values only, so the result carries a default index.

    Args:
        y: array-like data (anything numpy can convert to an array)

    Returns:
        (pd.DataFrame), dataframe built from np.array(y)
    """
    values = np.array(y)
    return pd.DataFrame(values)
def trim_array(arr_list):
    """
    Method used to trim a set of arrays to make all arrays the same length.

    Every array longer than the shortest one is cut down to the shortest length by dropping entries from its
    end. Trimming is done along the first axis, so the remaining dimensions of multi-dimensional arrays are kept.

    Args:
        arr_list: (list), list of numpy arrays, where arrays are different sizes

    Returns:
        arr_list: (list), list of trimmed numpy arrays, where arrays are same size. Arrays that already have the
            minimum length are returned as the same objects; trimmed arrays are new copies. An empty input
            gives an empty list.
    """
    # TODO: a better way to handle arrays with very different shapes? Otherwise average only uses # of points of smallest array
    if len(arr_list) == 0:
        return []
    size_min = min(arr.shape[0] for arr in arr_list)
    # One slice per array instead of deleting the last element over and over (each delete copied the array).
    # The copy keeps the result independent of the caller's input.
    return [arr if arr.shape[0] <= size_min else arr[:size_min].copy() for arr in arr_list]
def rounder(delta):
    """
    Method to obtain number of decimal places to report on plots

    Args:
        delta: (float), a float representing the change in two y values on a plot, used to obtain the plot axis spacing size

    Return:
        (int), an integer denoting the number of decimal places to use: 3, 2 or 1 for deltas in
            [0.001, 0.01), [0.01, 0.1) and [0.1, 1) respectively, and 0 for every other delta
    """
    # (lower bound, upper bound, decimal places); lower bound inclusive, upper bound exclusive
    bands = ((0.001, 0.01, 3), (0.01, 0.1, 2), (0.1, 1, 1))
    for lower, upper, places in bands:
        if lower <= delta < upper:
            return places
    return 0
def stat_to_string(name, value, nice_names):
    """
    Method that converts a metric object into a string for displaying on a plot.
    Stringifies the name value pair for display within a plot.

    Args:
        name: (str), long name of a stat metric or quantity
        value: (float), value of the metric or quantity. May also be None or empty (name only), a (mean, std)
            tuple, an int or a string
        nice_names: (dict), mapping from long metric names to the display names to use on the plot

    Return:
        (str), a string of the metric name, adjusted to look nicer for inclusion on a plot
    """
    if name in nice_names:
        name = nice_names[name]
    else:
        name = name.replace('_', ' ')
    # has a name only. "No value" is tested explicitly: a plain truthiness check would also drop
    # legitimate zero-valued statistics (e.g. a score of 0.0) from the plot.
    if value is None or (isinstance(value, (str, tuple, list, dict)) and len(value) == 0):
        return name
    # has a mean and std
    if isinstance(value, tuple):
        mean, std = value
        return f'{name}:' + '\n\t' + f'{mean:.3f}' + r'$\pm$' + f'{std:.3f}'
    # has a name and value only; whole numbers are shown without decimals
    if isinstance(value, int) or (isinstance(value, float) and value % 1 == 0):
        return f'{name}: {int(value)}'
    if isinstance(value, float):
        return f'{name}: {value:.3f}'
    return f'{name}: {value}'  # probably a string
def plot_stats(fig, stats, x_align=0.65, y_align=0.90, font_dict=dict(), fontsize=14):
    """
    Write a block of statistics onto a figure, one statistic per line. The text runs off screen if the
    entries are too long or too many in number.

    Args:
        fig: (matplotlib figure object), a matplotlib figure object
        stats: (dict), dict of statistics to be included with a plot
        x_align: (float), float denoting x position of where to align display of stats on a plot
        y_align: (float), float denoting y position of where to align display of stats on a plot
        font_dict: (dict), dict of matplotlib font options to alter display of stats on plot
        fontsize: (int), the fontsize of stats to display on plot

    Returns:
        None
    """
    # Look the display names up once and reuse them for every statistic
    display_names = nice_names()
    lines = [stat_to_string(stat_name, stat_value, nice_names=display_names)
             for stat_name, stat_value in stats.items()]
    fig.text(x_align, y_align, '\n'.join(lines),
             verticalalignment='top',
             wrap=True,
             fontdict=font_dict,
             fontproperties=FontProperties(size=fontsize))
def make_fig_ax(aspect_ratio=0.5, x_align=0.65, left=0.10):
    """
    Create a pyplot figure with one manually positioned axes that leaves room on the right of the figure.
    Using Object Oriented interface from https://matplotlib.org/gallery/api/agg_oo_sgskip.html

    Args:
        aspect_ratio: (float), aspect ratio for figure and axes creation
        x_align: (float), x position to draw edge of figure. Needed so can display stats alongside plot
        left: (float), the leftmost position to draw edge of figure

    Returns:
        fig: (matplotlib fig object), a matplotlib figure object with the specified aspect ratio
        ax: (matplotlib ax object), a matplotlib axes object with the specified aspect ratio
    """
    # Fixed margins around the axes. For positioning details see:
    # https://python4astronomers.github.io/plotting/advanced.html
    bottom, right, top = 0.15, 0.01, 0.05

    # Set image aspect ratio:
    fig_width, fig_height = figaspect(aspect_ratio)
    fig = plt.figure(figsize=(fig_width, fig_height))
    FigureCanvas(fig)

    # (left, bottom, width, height) of the axes; the width stops at x_align minus the right margin
    axes_box = (left, bottom, x_align - left - right, 1 - bottom - top)
    ax = fig.add_axes(axes_box, frameon=True)
    fig.set_tight_layout(False)
    return fig, ax
def make_fig_ax_square(aspect='equal', aspect_ratio=1):
    """
    Create a matplotlib Figure holding a single subplot, square shaped with the default arguments.
    Using Object Oriented interface from https://matplotlib.org/gallery/api/agg_oo_sgskip.html

    Args:
        aspect: (str), 'equal' denotes x and y aspect will be equal (i.e. square)
        aspect_ratio: (float), aspect ratio for figure and axes creation

    Returns:
        fig: (matplotlib fig object), a matplotlib figure object with the specified aspect ratio
        ax: (matplotlib ax object), a matplotlib axes object with the specified aspect ratio
    """
    # Set image aspect ratio:
    fig_width, fig_height = figaspect(aspect_ratio)
    square_fig = Figure(figsize=(fig_width, fig_height))
    FigureCanvas(square_fig)
    return square_fig, square_fig.add_subplot(111, aspect=aspect)
def make_axis_same(ax, max1, min1):
    """
    Put identical tick positions on the x and y axis. Useful for parity plots

    Args:
        ax: (matplotlib axis object), a matplotlib axes object
        max1: (float), the maximum value of a particular axis
        min1: (float), the minimum value of a particular axis

    Returns:
        None
    """
    span = max1 - min1
    if span > 5:
        # Wide range: integer ticks, spaced a third of the integer range apart
        start, stop = int(min1), int(max1)
        stride = (stop - start) // 3
        ticks = range(start, stop + stride, stride)
    else:
        # Narrow range: five evenly spaced ticks including both end points
        ticks = np.linspace(min1, max1, 5)
    for set_ticks in (ax.set_xticks, ax.set_yticks):
        set_ticks(ticks)
def nice_mean(ls):
    """
    Mean of a list or equivalent array that may be empty

    Args:
        ls: (list), list of values

    Returns:
        (float), mean of the values, or NaN if the list has no values
    """
    return np.mean(ls) if len(ls) > 0 else np.nan
def nice_std(ls):
    """
    Standard deviation of a list or equivalent array that may be empty

    Args:
        ls: (list), list of values

    Returns:
        (float), standard deviation of the values (as computed by np.std), or NaN if the list has no values
    """
    return np.std(ls) if len(ls) > 0 else np.nan
def round_down(num, divisor):
    """
    Round a number down to a multiple of divisor

    Args:
        num: (float), a number to round down
        divisor: (int), divisor to denote how to round down

    Returns:
        (float), the rounded-down number
    """
    remainder = num % divisor
    return num - remainder
def round_up(num, divisor):
    """
    Round a number up to a multiple of divisor

    Args:
        num: (float), a number to round up
        divisor: (int), divisor to denote how to round up

    Returns:
        (float), the rounded-up number
    """
    multiples = float(math.ceil(num / divisor))
    return multiples * divisor
[docs]def get_divisor(high, low):
"""
Method to obtain a sensible divisor based on range of two values
The divisor starts at 10 and is overwritten depending on which threshold band delta = high - low falls in.
All comparisons are strict, so a delta of exactly 1000, 100, 10 or 1 matches none of the paired checks above
the final if/else.
Args:
high: (float), a max data value
low: (float), a min data value
Returns:
divisor: (float), a number used to make sensible axis ticks
"""
delta = high-low
# Default, kept when no later check overwrites it
divisor = 10
if delta > 1000:
divisor = 100
if delta < 1000:
if delta > 100:
divisor = 10
if delta < 100:
if delta > 10:
divisor = 1
if delta < 10:
if delta > 1:
divisor = 0.1
if delta < 1:
if delta > 0.01:
divisor = 0.001
# NOTE(review): both branches of this final if/else assign 0.001. Indentation is not visible here; if this
# else pairs with "if delta < 1", every delta >= 1 ends up with 0.001 and the values set above are
# discarded -- confirm the intended pairing.
else:
divisor = 0.001
return divisor
def recursive_max(arr):
    """
    Find the largest value in an arbitrarily nested collection of iterables, descending into every
    element that is itself iterable.
    Credit: https://www.linkedin.com/pulse/ask-recursion-during-coding-interviews-identify-good-talent-veteanu/

    Args:
        arr: (numpy array), an array of values or iterables

    Returns:
        (float), max value in arr
    """
    candidates = []
    for element in arr:
        if isinstance(element, Iterable):
            # Nested container: reduce it to its own maximum first
            candidates.append(recursive_max(element))
        else:
            candidates.append(element)
    return max(candidates)
def recursive_min(arr):
    """
    Find the smallest value in an arbitrarily nested collection of iterables, descending into every
    element that is itself iterable.
    Credit: https://www.linkedin.com/pulse/ask-recursion-during-coding-interviews-identify-good-talent-veteanu/

    Args:
        arr: (numpy array), an array of values or iterables

    Returns:
        (float), min value in arr
    """
    candidates = []
    for element in arr:
        if isinstance(element, Iterable):
            # Nested container: reduce it to its own minimum first
            candidates.append(recursive_min(element))
        else:
            candidates.append(element)
    return min(candidates)
def recursive_max_and_min(arr):
    """
    Return both the largest and the smallest value found in a nested array of values or iterables

    Args:
        arr: (numpy array), an array of values or iterables

    Returns:
        (tuple), tuple containing max and min of arr, in that order
    """
    highest = recursive_max(arr)
    lowest = recursive_min(arr)
    return highest, lowest
def _set_tick_labels(ax, maxx, minn):
    """
    Give the x and y axis the same tick range by using one max/min pair for both axes

    Args:
        ax: (matplotlib axes object), a matplotlib axes object
        maxx: (float), a maximum value
        minn: (float), a minimum value

    Returns:
        None
    """
    # Same bounds for both axes, so the general two-range helper does all the work
    _set_tick_labels_different(ax,
                               max_tick_x=maxx,
                               min_tick_x=minn,
                               max_tick_y=maxx,
                               min_tick_y=minn)
def _set_tick_labels_different(ax, max_tick_x, min_tick_x, max_tick_y, min_tick_y):
    """
    Method that sets the x and y ticks, when the axes have different ranges.

    Tick positions come from nice_range. If the gap before the last tick is smaller than an earlier gap, the
    last tick is dropped so labels do not crowd at the end of the axis. This check needs at least four ticks
    and is skipped for shorter lists, so empty or single-tick ranges no longer raise an IndexError.

    Args:
        ax: (matplotlib axes object), a matplotlib axes object
        max_tick_x: (float), the maximum tick value for the x axis
        min_tick_x: (float), the minimum tick value for the x axis
        max_tick_y: (float), the maximum tick value for the y axis
        min_tick_y: (float), the minimum tick value for the y axis

    Returns:
        None
    """
    def drop_crowded_last_tick(tickvals):
        # Compare the final gap with the gap two ticks earlier; only meaningful with four or more ticks
        if len(tickvals) >= 4 and tickvals[-1] - tickvals[-2] < tickvals[-3] - tickvals[-4]:
            return tickvals[:-1]
        return tickvals

    tickvals_x = drop_crowded_last_tick(nice_range(min_tick_x, max_tick_x))
    tickvals_y = drop_crowded_last_tick(nice_range(min_tick_y, max_tick_y))
    ax.set_xticks(ticks=tickvals_x)
    ax.set_yticks(ticks=tickvals_y)
    ticklabels_x = [str(tick) for tick in tickvals_x]
    ticklabels_y = [str(tick) for tick in tickvals_y]

    # Look at length of x tick labels to see if may be possibly crowded. If so, rotate labels.
    # The second label is used as the sample, falling back to the only label when there is just one.
    rotation = 0
    if ticklabels_x:
        sample_label = ticklabels_x[1] if len(ticklabels_x) > 1 else ticklabels_x[0]
        if len(sample_label) >= 4:
            rotation = 45
    ax.set_xticklabels(labels=ticklabels_x, fontsize=14, rotation=rotation)
    ax.set_yticklabels(labels=ticklabels_y, fontsize=14)
def _clean_tick_labels(tickvals, delta):
    """
    Thin out axis tick values so they don't overlap from being too dense

    Args:
        tickvals: (list), a list containing the initial axis tick values
        delta: (float), number representing the numerical difference of two ticks

    Returns:
        tickvals_clean: (list), a list containing the updated axis tick values. When delta is not at least 100
            the input list itself is returned
    """
    if not delta >= 100:
        return tickvals
    tickvals_clean = []
    for position, tick in enumerate(tickvals):
        # Keep a tick only if it lies at least 100 above its predecessor. For the first tick the
        # predecessor index is -1, i.e. the last tick in the list.
        if tick - tickvals[position - 1] >= 100:
            tickvals_clean.append(tick)
    return tickvals_clean
# Math utilities to aid plot_helper to make ranges
def nice_range(lower, upper):
    """
    Build a list of nicely spaced values running from one bound to the other. If upper is smaller than
    lower the values are returned in descending order.

    Args:
        lower: (float or int), lower bound of range to create
        upper: (float or int), upper bound of range to create

    Returns:
        (list), list of numerical values in established range
    """
    # Case for validation where nan is passed in
    lower = 0 if np.isnan(lower) else lower
    upper = 0.1 if np.isnan(upper) else upper

    # The helper expects ascending bounds; remember whether the result has to be inverted
    descending = upper < lower
    if descending:
        lower, upper = upper, lower

    ticks = [_int_if_int(tick) for tick in _nice_range_helper(lower, upper)]
    if descending:
        ticks.reverse()
    return ticks
def _nice_range_helper(lower, upper):
    """
    Generator that produces a better range of axis ticks between lower and upper.

    The step is the power of ten nearest the exact step for 8 intervals, scaled by whichever of a fixed set
    of factors brings the interval count closest to 8.

    Args:
        lower: (float), lower value of axis ticks
        upper: (float), upper value of axis ticks

    Yields:
        (float), successive tick values. The last value is upper after _three_sigfigs rounding. When lower
            and upper are equal, the single value lower is yielded.
    """
    steps = 8
    diff = abs(lower - upper)
    # special case where lower and upper are the same. This function is a generator, so the value has to
    # be yielded: returning a list here would end the iteration without producing any tick at all.
    if diff == 0:
        yield lower
        return
    # the exact step needed
    step = diff / steps
    # a rough estimate of best step
    step = _nearest_pow_ten(step)  # whole decimal increments
    # tune in on the best step size
    factors = [0.1, 0.2, 0.5, 1, 2, 5, 10]

    # use this to minimize how far we are from ideal step size
    def best_one(steps_factor):
        steps_count, _ = steps_factor
        return abs(steps_count - steps)

    _, best_factor = min([(diff / (step * f), f) for f in factors], key=best_one)
    # multiply in the optimal factor for getting as close to the target number of steps as we can
    step = step * best_factor
    # make the bounds look nice
    lower = _three_sigfigs(lower)
    upper = _three_sigfigs(upper)
    start = _round_up(lower, step)
    # prepare for iteration
    x = start
    i = 0
    # iterate until we reach upper
    while x < upper - step:
        x = start + i * step
        yield _three_sigfigs(x)  # using sigfigs because of floating point error
        i += 1
    # finish off with ending bound
    yield upper
def _three_sigfigs(x):
    """
    Special case of _n_sigfigs with n fixed to 3

    Args:
        x: (float), an axis tick number

    Returns:
        (float), the result of _n_sigfigs(x, 3)
    """
    return _n_sigfigs(x, n=3)
def _n_sigfigs(x, n):
    """
    Round an axis tick number to a number of decimal places derived from n and the order of magnitude of x

    Args:
        x: (float), an axis tick number
        n: (int), number of significant figures asked for

    Returns:
        (float), the rounded value of x with its sign preserved; 0 if x is 0
    """
    if x == 0:
        return 0
    # Work on the magnitude and put the sign back at the end
    sign = -1 if x < 0 else 1
    magnitude = -x if x < 0 else x
    order = round(log(magnitude, 10))
    if magnitude < 1:
        places = n - order
    else:
        places = (n - 1) - order
    return sign * round(magnitude, places)
def _nearest_pow_ten(x):
    """
    Return the power of ten at or above the magnitude of an axis tick value, keeping the sign of x

    Args:
        x: (float), an axis tick number

    Returns:
        (float), signed power of ten obtained by rounding log10 of |x| up; 0 if x is 0
    """
    if x == 0:
        return 0
    sign = -1 if x < 0 else 1
    magnitude = -x if x < 0 else x
    exponent = ceil(log(magnitude, 10))
    return sign*10**exponent
def _int_if_int(x):
    """
    Convert x to an int when that loses nothing

    Args:
        x: (float or int), a number

    Returns:
        x: (int or float), int(x) if it equals x, otherwise x unchanged
    """
    as_int = int(x)
    return as_int if as_int == x else x
def _round_up(x, inc):
    """
    Round the magnitude of x up to a multiple of inc and restore the sign of x

    Args:
        x: (float or int), a number
        inc: (float), an increment for axis ticks

    Returns:
        (float), value of x rounded up in magnitude (negative values therefore move away from zero)
    """
    negative = x < 0
    magnitude = -x if negative else x
    sign = -1 if negative else 1
    return sign * inc * ceil(magnitude / inc)
def nice_names():
    """
    Build the mapping from long metric names to the short display names used on plots

    Returns:
        (dict), a new dict on every call, with classification metrics first and regression metrics after them
    """
    classification = {
        'accuracy': 'Accuracy',
        'f1_binary': '$F_1$',
        'f1_macro': 'f1_macro',
        'f1_micro': 'f1_micro',
        'f1_samples': 'f1_samples',
        'f1_weighted': 'f1_weighted',
        'log_loss': 'log_loss',
        'precision_binary': 'Precision',
        'precision_macro': 'prec_macro',
        'precision_micro': 'prec_micro',
        'precision_samples': 'prec_samples',
        'precision_weighted': 'prec_weighted',
        'recall_binary': 'Recall',
        'recall_macro': 'rcl_macro',
        'recall_micro': 'rcl_micro',
        'recall_samples': 'rcl_samples',
        'recall_weighted': 'rcl_weighted',
        'roc_auc': 'ROC_AUC',
    }
    regression = {
        'explained_variance': 'expl_var',
        'mean_absolute_error': 'MAE',
        'mean_squared_error': 'MSE',
        'mean_squared_log_error': 'MSLE',
        'median_absolute_error': 'MedAE',
        'root_mean_squared_error': 'RMSE',
        'rmse_over_stdev': r'RMSE/$\sigma_y$',
        'r2_score': '$R^2$',
        'r2_score_noint': '$R^2_{noint}$',
        'r2_score_adjusted': '$R^2_{adjusted}$',
        'r2_score_fitted': '$R^2_{fitted}$',
    }
    return {**classification, **regression}