Source code for mastml.plots

"""
This module contains classes used for generating different types of analysis plots

Scatter:
    This class contains a variety of scatter plot types, e.g. parity (predicted vs. true) plots

Error:
    This class contains plotting methods used to quantify model errors and assess uncertainty quantification.

Histogram:
    This class contains methods for constructing histograms of data distributions and visualizing model residuals.

Line:
    This class contains methods for making line plots, e.g. for constructing learning curves of model performance vs.
    amount of data or number of features.

"""

import warnings
import math
import os
import pandas as pd
import numpy as np
try:
    from collections.abc import Iterable
except ImportError:
    from collections import Iterable
from math import log, ceil
import scipy
from scipy.stats import gaussian_kde, norm
import scipy.stats as stats

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.exceptions import NotFittedError

from mastml.metrics import Metrics
from mastml.error_analysis import ErrorUtils

import matplotlib
from matplotlib import pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure, figaspect
from matplotlib.font_manager import FontProperties
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1 import make_axes_locatable

try:
    import statsmodels.api as sm
except ImportError:
    print('statsmodels is an optional dependency. To create QQ plots for error analysis, run "pip install statsmodels"')

matplotlib.rc('font', size=18, family='sans-serif')  # set all font to bigger
matplotlib.rc('figure', autolayout=True)  # turn on autolayout

warnings.filterwarnings(action="ignore")

class Scatter():
    """
    Class to generate scatter plots, such as parity plots showing true vs. predicted data values

    Args:
        None

    Methods:
        plot_predicted_vs_true: method to plot a parity plot
            Args:
                y_true: (pd.Series), series of true y data
                y_pred: (pd.Series), series of predicted y data
                savepath: (str), string denoting the save path for the figure image
                data_type: (str), string denoting the data type (e.g. train, test, leaveout)
                x_label: (str), string denoting the true and predicted property name
                metrics_list: (list), list of strings of metric names to evaluate and include on the figure
                show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)
            Returns:
                None

        plot_best_worst_split: method to find the best and worst split in an evaluation set and plot them together
            Args:
                savepath: (str), string denoting the save path for the figure image
                data_type: (str), string denoting the data type (e.g. train, test, leaveout)
                x_label: (str), string denoting the true and predicted property name
                metrics_list: (list), list of strings of metric names to evaluate and include on the figure
                show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)
            Returns:
                None

        plot_best_worst_per_point: method to find all of the best and worst data points from an evaluation set and plot them together
            Args:
                savepath: (str), string denoting the save path for the figure image
                data_type: (str), string denoting the data type (e.g. train, test, leaveout)
                x_label: (str), string denoting the true and predicted property name
                metrics_list: (list), list of strings of metric names to evaluate and include on the figure
                show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)
            Returns:
                None

        plot_predicted_vs_true_bars: method to plot the average predicted value of each data point from an evaluation set with error bars denoting the standard deviation in predicted values
            Args:
                savepath: (str), string denoting the save path for the figure image
                data_type: (str), string denoting the data type (e.g. train, test, leaveout)
                x_label: (str), string denoting the true and predicted property name
                metrics_list: (list), list of strings of metric names to evaluate and include on the figure
                show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)
            Returns:
                None

        plot_metric_vs_group: method to plot the metric value for each group during e.g. a LeaveOneGroupOut data split
            Args:
                savepath: (str), string denoting the save path for the figure image
                data_type: (str), string denoting the data type (e.g. train, test, leaveout)
                show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)
            Returns:
                None
    """
    @classmethod
    def plot_predicted_vs_true(cls, y_true, y_pred, savepath, data_type, x_label, metrics_list=None, show_figure=False, ebars=None, file_extension='.csv', image_dpi=250, groups=None):
        # Make the dataframe/array 1D if it isn't
        y_true = check_dimensions(y_true)
        y_pred = check_dimensions(y_pred)

        # Set image aspect ratio:
        fig, ax = make_fig_ax()

        # gather max and min
        maxx = max(np.nanmax(y_true), np.nanmax(y_pred))
        minn = min(np.nanmin(y_true), np.nanmin(y_pred))

        _set_tick_labels(ax, maxx, minn)

        if groups is not None:
            groups_unique = np.unique(groups)
            colors = ['blue', 'green', 'red', 'purple', 'orange', 'black', 'grey']
            shapes = ['o', 's', '^', 'x', '<', '>', 'h']
            count = 0
            lap = 0
            for group in groups_unique:
                ax.scatter(np.array(y_true)[np.where(groups==group)], np.array(y_pred)[np.where(groups==group)], c=colors[count], marker=shapes[lap], zorder=2, s=100, alpha=0.7, label=group)
                if count < len(colors)-1:
                    count += 1
                else:
                    lap += 1
                    count = 0
            ax.legend(loc='best', fontsize=8)
        else:
            ax.scatter(y_true, y_pred, c='b', edgecolor='darkblue', zorder=2, s=100, alpha=0.7)

        if ebars is not None:
            if groups is not None:
                groups_unique = np.unique(groups)
                colors = ['blue', 'green', 'red', 'purple', 'orange', 'black', 'grey']
                shapes = ['o', 's', '^', 'x', '<', '>', 'h']
                count = 0
                lap = 0
                for group in groups_unique:
                    ax.errorbar(np.array(y_true)[np.where(groups == group)], np.array(y_pred)[np.where(groups == group)], yerr=np.array(ebars)[np.where(groups == group)], fmt=shapes[lap], markerfacecolor=colors[count], markeredgecolor=colors[count], ecolor=colors[count], markersize=10, alpha=0.7, capsize=3)
                    if count < len(colors) - 1:
                        count += 1
                    else:
                        lap += 1
                        count = 0
                ax.legend(loc='best', fontsize=8)
            else:
                ax.errorbar(y_true, y_pred, yerr=ebars, fmt='o', markerfacecolor='blue', markeredgecolor='black', ecolor='blue', markersize=10, alpha=0.7, capsize=3)

        # draw dashed diagonal parity line
        ax.plot([minn, maxx], [minn, maxx], 'k--', lw=2, zorder=1)

        ax.set_xlabel('True ' + x_label, fontsize=14)
        ax.set_ylabel('Predicted ' + x_label, fontsize=14)

        if metrics_list is None:
            # Use some default metric set
            metrics_list = ['r2_score', 'mean_absolute_error', 'root_mean_squared_error', 'rmse_over_stdev']
        stats_dict = Metrics(metrics_list=metrics_list).evaluate(y_true=y_true, y_pred=y_pred)

        if groups is not None:
            stats_dict_group = dict()
            for group in groups_unique:
                stats_dict_group[group] = Metrics(metrics_list=metrics_list).evaluate(y_true=np.array(y_true)[np.where(groups==group)], y_pred=np.array(y_pred)[np.where(groups==group)])
            stats_dict_group['Overall'] = Metrics(metrics_list=metrics_list).evaluate(y_true=y_true, y_pred=y_pred)
            stats_group_df = pd.DataFrame().from_dict(stats_dict_group, orient='index', columns=metrics_list)
            if file_extension == '.xlsx':
                stats_group_df.to_excel(os.path.join(savepath, str(data_type) + '_stats_pergroup_summary' + '.xlsx'))
            elif file_extension == '.csv':
                stats_group_df.to_csv(os.path.join(savepath, str(data_type) + '_stats_pergroup_summary' + '.csv'))
            cls.plot_metric_vs_group(groups_unique, stats_group_df, metrics_list, savepath, data_type, show_figure, file_extension, image_dpi)

        plot_stats(fig, stats_dict, x_align=0.65, y_align=0.90, fontsize=12)

        if ebars is not None:
            if groups is not None:
                fig.savefig(os.path.join(savepath, 'parity_plot_withcalibratederrorbars_grouplabels_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')
            else:
                fig.savefig(os.path.join(savepath, 'parity_plot_withcalibratederrorbars_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')
            df = pd.DataFrame({'y true': y_true, 'y pred': y_pred, 'y err': ebars})
            if file_extension == '.xlsx':
                df.to_excel(os.path.join(savepath, 'parity_plot_withcalibratederrorbars_' + str(data_type) + '.xlsx'), index=False)
            elif file_extension == '.csv':
                df.to_csv(os.path.join(savepath, 'parity_plot_withcalibratederrorbars_' + str(data_type) + '.csv'), index=False)
        else:
            if groups is not None:
                fig.savefig(os.path.join(savepath, 'parity_plot_grouplabels_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')
            else:
                fig.savefig(os.path.join(savepath, 'parity_plot_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')
            df = pd.DataFrame({'y true': y_true, 'y pred': y_pred})
            if file_extension == '.xlsx':
                df.to_excel(os.path.join(savepath, 'parity_plot_' + str(data_type) + '.xlsx'), index=False)
            elif file_extension == '.csv':
                df.to_csv(os.path.join(savepath, 'parity_plot_' + str(data_type) + '.csv'), index=False)

        if show_figure == True:
            plt.show()
        else:
            plt.close()
        return
    @classmethod
    def plot_best_worst_split(cls, savepath, data_type, x_label, metrics_list, show_figure=False, file_extension='.csv', image_dpi=250):
        dirs = os.listdir(savepath)
        splitdirs = [d for d in dirs if 'split_' in d and '.png' not in d]

        stats_files_dict = dict()
        for splitdir in splitdirs:
            if file_extension == '.xlsx':
                stats_files_dict[splitdir] = pd.read_excel(os.path.join(os.path.join(savepath, splitdir), data_type + '_stats_summary.xlsx'), engine='openpyxl').to_dict('records')[0]
            elif file_extension == '.csv':
                stats_files_dict[splitdir] = pd.read_csv(os.path.join(os.path.join(savepath, splitdir), data_type + '_stats_summary.csv')).to_dict('records')[0]

        # Find best/worst splits based on RMSE value
        rmse_best = 10**20
        rmse_worst = 0
        for split, stats_dict in stats_files_dict.items():
            if stats_dict['root_mean_squared_error'] < rmse_best:
                best_split = split
                rmse_best = stats_dict['root_mean_squared_error']
            if stats_dict['root_mean_squared_error'] > rmse_worst:
                worst_split = split
                rmse_worst = stats_dict['root_mean_squared_error']

        if file_extension == '.xlsx':
            if data_type == 'test':
                y_true_best = pd.read_excel(os.path.join(os.path.join(savepath, best_split), 'y_test.xlsx'), engine='openpyxl')
                y_pred_best = pd.read_excel(os.path.join(os.path.join(savepath, best_split), 'y_pred.xlsx'), engine='openpyxl')
                y_true_worst = pd.read_excel(os.path.join(os.path.join(savepath, worst_split), 'y_test.xlsx'), engine='openpyxl')
                y_pred_worst = pd.read_excel(os.path.join(os.path.join(savepath, worst_split), 'y_pred.xlsx'), engine='openpyxl')
            elif data_type == 'train':
                y_true_best = pd.read_excel(os.path.join(os.path.join(savepath, best_split), 'y_train.xlsx'), engine='openpyxl')
                y_pred_best = pd.read_excel(os.path.join(os.path.join(savepath, best_split), 'y_pred_train.xlsx'), engine='openpyxl')
                y_true_worst = pd.read_excel(os.path.join(os.path.join(savepath, worst_split), 'y_train.xlsx'), engine='openpyxl')
                y_pred_worst = pd.read_excel(os.path.join(os.path.join(savepath, worst_split), 'y_pred_train.xlsx'), engine='openpyxl')
        elif file_extension == '.csv':
            if data_type == 'test':
                y_true_best = pd.read_csv(os.path.join(os.path.join(savepath, best_split), 'y_test.csv'))
                y_pred_best = pd.read_csv(os.path.join(os.path.join(savepath, best_split), 'y_pred.csv'))
                y_true_worst = pd.read_csv(os.path.join(os.path.join(savepath, worst_split), 'y_test.csv'))
                y_pred_worst = pd.read_csv(os.path.join(os.path.join(savepath, worst_split), 'y_pred.csv'))
            elif data_type == 'train':
                y_true_best = pd.read_csv(os.path.join(os.path.join(savepath, best_split), 'y_train.csv'))
                y_pred_best = pd.read_csv(os.path.join(os.path.join(savepath, best_split), 'y_pred_train.csv'))
                y_true_worst = pd.read_csv(os.path.join(os.path.join(savepath, worst_split), 'y_train.csv'))
                y_pred_worst = pd.read_csv(os.path.join(os.path.join(savepath, worst_split), 'y_pred_train.csv'))

        # Make the dataframe/array 1D if it isn't
        y_true_best = check_dimensions(y_true_best)
        y_pred_best = check_dimensions(y_pred_best)
        y_true_worst = check_dimensions(y_true_worst)
        y_pred_worst = check_dimensions(y_pred_worst)

        # Set image aspect ratio:
        fig, ax = make_fig_ax()

        # gather max and min
        maxx = max(np.nanmax(y_true_best), np.nanmax(y_pred_best), np.nanmax(y_true_worst), np.nanmax(y_pred_worst))
        minn = min(np.nanmin(y_true_best), np.nanmin(y_pred_best), np.nanmin(y_true_worst), np.nanmin(y_pred_worst))

        _set_tick_labels(ax, maxx, minn)

        ax.scatter(y_true_best, y_pred_best, c='b', edgecolor='darkblue', zorder=2, s=100, alpha=0.7, label='Best split')
        ax.scatter(y_true_worst, y_pred_worst, c='r', edgecolor='darkred', zorder=2, s=100, alpha=0.7, label='Worst split')
        ax.legend(loc='best')

        # draw dashed diagonal parity line
        ax.plot([minn, maxx], [minn, maxx], 'k--', lw=2, zorder=1)
        ax.set_xlabel('True ' + x_label, fontsize=14)
        ax.set_ylabel('Predicted ' + x_label, fontsize=14)

        stats_dict_best = Metrics(metrics_list=metrics_list).evaluate(y_true=y_true_best, y_pred=y_pred_best)
        stats_dict_worst = Metrics(metrics_list=metrics_list).evaluate(y_true=y_true_worst, y_pred=y_pred_worst)

        plot_stats(fig, stats_dict_best, x_align=0.65, y_align=0.90, font_dict={'fontsize': 12, 'color': 'blue'})
        plot_stats(fig, stats_dict_worst, x_align=0.65, y_align=0.50, font_dict={'fontsize': 12, 'color': 'red'})

        # Save the figure image
        fig.savefig(os.path.join(savepath, 'parity_plot_best_worst_split_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')
        if show_figure == True:
            plt.show()
        else:
            plt.close()
        return
    @classmethod
    def plot_best_worst_per_point(cls, savepath, data_type, x_label, metrics_list, show_figure=False, file_extension='.csv', image_dpi=250):
        # Get lists of all ytrue and ypred for each split
        dirs = os.listdir(savepath)
        splitdirs = [d for d in dirs if 'split_' in d and '.png' not in d]

        y_true_list = list()
        y_pred_list = list()
        index_list = list()
        if file_extension == '.xlsx':
            for splitdir in splitdirs:
                y_true_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'y_' + str(data_type) + '.xlsx'), engine='openpyxl'))
                if data_type == 'test':
                    y_pred_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'y_pred.xlsx'), engine='openpyxl'))
                    index_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'test_inds.xlsx'), engine='openpyxl'))
                elif data_type == 'train':
                    y_pred_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'y_pred_train.xlsx'), engine='openpyxl'))
                    index_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'train_inds.xlsx'), engine='openpyxl'))
        elif file_extension == '.csv':
            for splitdir in splitdirs:
                y_true_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'y_' + str(data_type) + '.csv')))
                if data_type == 'test':
                    y_pred_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'y_pred.csv')))
                    index_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'test_inds.csv')))
                elif data_type == 'train':
                    y_pred_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'y_pred_train.csv')))
                    index_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'train_inds.csv')))

        all_y_true = list()
        all_y_pred = list()
        all_abs_residuals = list()
        all_indices = list()
        for yt, y_pred, indices in zip(y_true_list, y_pred_list, index_list):
            yt = np.array(check_dimensions(yt))
            y_pred = np.array(check_dimensions(y_pred))
            indices = np.array(check_dimensions(indices))
            abs_residuals = abs(yt - y_pred)
            all_y_true.append(yt)
            all_y_pred.append(y_pred)
            all_abs_residuals.append(abs_residuals)
            all_indices.append(indices)
        all_y_true_flat = np.array([item for sublist in all_y_true for item in sublist])
        all_y_pred_flat = np.array([item for sublist in all_y_pred for item in sublist])
        all_indices_flat = np.array([item for sublist in all_indices for item in sublist])
        all_residuals_flat = np.array([item for sublist in all_abs_residuals for item in sublist])

        # Loop over indices
        unique_inds = np.unique(all_indices_flat)
        bests = list()
        worsts = list()
        y_true_unique = list()
        for ind in unique_inds:
            y_true_unique.append(np.mean(all_y_true_flat[np.where(all_indices_flat == ind)[0]]))
            best = min(abs(all_y_pred_flat[np.where(all_indices_flat == ind)] - all_y_true_flat[np.where(all_indices_flat == ind)]))
            worst = max(abs(all_y_pred_flat[np.where(all_indices_flat == ind)] - all_y_true_flat[np.where(all_indices_flat == ind)]))
            bests.append(all_y_pred_flat[np.where(all_residuals_flat == best)])
            worsts.append(all_y_pred_flat[np.where(all_residuals_flat == worst)])

        y_true_unique = np.array(y_true_unique).ravel()
        bests = np.array(bests).ravel()
        worsts = np.array(worsts).ravel()

        stats_dict_best = Metrics(metrics_list=metrics_list).evaluate(y_true=y_true_unique, y_pred=bests)
        stats_dict_worst = Metrics(metrics_list=metrics_list).evaluate(y_true=y_true_unique, y_pred=worsts)

        fig, ax = make_fig_ax(x_align=0.65)

        # gather max and min
        maxx = float(max([max(y_true_unique), max(bests), max(worsts)]))
        minn = float(min([min(y_true_unique), min(bests), min(worsts)]))

        # draw dashed diagonal parity line
        ax.plot([minn, maxx], [minn, maxx], 'k--', lw=2, zorder=1)

        # set axis labels
        ax.set_xlabel('True ' + x_label, fontsize=16)
        ax.set_ylabel('Predicted ' + x_label, fontsize=16)

        # set tick labels
        #maxx = round(float(max1), rounder(max1-min1))
        #minn = round(float(min1), rounder(max1-min1))
        _set_tick_labels(ax, maxx, minn)

        ax.scatter(y_true_unique, bests, c='b', alpha=0.7, label='best all points', edgecolor='darkblue', zorder=2, s=100)
        ax.scatter(y_true_unique, worsts, c='r', alpha=0.7, label='worst all points', edgecolor='darkred', zorder=2, s=70)
        ax.legend(loc='best', fontsize=12)

        #plot_stats(fig, avg_stats, x_align=x_align, y_align=0.51, fontsize=10)
        plot_stats(fig, stats_dict_best, x_align=0.65, y_align=0.90, font_dict={'fontsize': 10, 'color': 'b'})
        plot_stats(fig, stats_dict_worst, x_align=0.65, y_align=0.50, font_dict={'fontsize': 10, 'color': 'r'})

        # Save the figure image
        fig.savefig(os.path.join(savepath, 'parity_plot_best_worst_eachpoint_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')
        if show_figure == True:
            plt.show()
        else:
            plt.close()
        return
    @classmethod
    def plot_predicted_vs_true_bars(cls, savepath, x_label, data_type, metrics_list, show_figure=False, ebars=None, file_extension='.csv', image_dpi=250, groups=None):
        # Get lists of all ytrue and ypred for each split
        dirs = os.listdir(savepath)
        splitdirs = [d for d in dirs if 'split_' in d and '.png' not in d]

        y_true_list = list()
        y_pred_list = list()
        data_ind_list = list()
        groups_list = list()
        if file_extension == '.xlsx':
            for splitdir in splitdirs:
                y_true_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'y_' + str(data_type) + '.xlsx'), engine='openpyxl'))
                if data_type == 'test':
                    y_pred_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'y_pred.xlsx'), engine='openpyxl'))
                    data_ind_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'test_inds.xlsx'), engine='openpyxl'))
                    if groups is not None:
                        groups_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'test_groups.xlsx'), engine='openpyxl'))
                elif data_type == 'train':
                    y_pred_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'y_pred_train.xlsx'), engine='openpyxl'))
                    data_ind_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'train_inds.xlsx'), engine='openpyxl'))
                    if groups is not None:
                        groups_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'train_groups.xlsx'), engine='openpyxl'))
                elif data_type == 'leaveout':
                    y_pred_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'y_pred_leaveout.xlsx'), engine='openpyxl'))
                    data_ind_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'leaveout_inds.xlsx'), engine='openpyxl'))
                    if groups is not None:
                        groups_list.append(pd.read_excel(os.path.join(os.path.join(savepath, splitdir), 'leaveout_groups.xlsx'), engine='openpyxl'))
        elif file_extension == '.csv':
            for splitdir in splitdirs:
                y_true_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'y_' + str(data_type) + '.csv')))
                if data_type == 'test':
                    y_pred_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'y_pred.csv')))
                    data_ind_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'test_inds.csv')))
                    if groups is not None:
                        groups_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'test_groups.csv')))
                elif data_type == 'train':
                    y_pred_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'y_pred_train.csv')))
                    data_ind_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'train_inds.csv')))
                    if groups is not None:
                        groups_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'train_groups.csv')))
                elif data_type == 'leaveout':
                    y_pred_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'y_pred_leaveout.csv')))
                    data_ind_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'leaveout_inds.csv')))
                    if groups is not None:
                        groups_list.append(pd.read_csv(os.path.join(os.path.join(savepath, splitdir), 'leaveout_groups.csv')))

        all_y_true = list()
        all_y_pred = list()
        all_data_inds = list()
        all_groups = list()
        if groups is not None:
            for groups in groups_list:
                all_groups.append(np.array(groups))
        for yt, y_pred, data_inds in zip(y_true_list, y_pred_list, data_ind_list):
            yt = np.array(check_dimensions(yt))
            y_pred = np.array(check_dimensions(y_pred))
            data_inds = np.array(check_dimensions(data_inds))
            all_y_true.append(yt)
            all_y_pred.append(y_pred)
            all_data_inds.append(data_inds)

        df_all = pd.DataFrame({'all_y_true': np.array([item for sublist in all_y_true for item in sublist]),
                               'all_y_pred': np.array([item for sublist in all_y_pred for item in sublist]),
                               'all_data_inds': np.array([item for sublist in all_data_inds for item in sublist])})
        if groups is not None:
            df_all['all_groups'] = np.array([item for sublist in all_groups for item in sublist])
        if ebars is not None:
            df_all['ebars'] = np.array(ebars)
        df_all_grouped = df_all.groupby('all_data_inds', as_index=False, sort=False)
        df_avg = df_all_grouped.mean()
        df_std = df_all_grouped.std()

        # make fig and ax, use x_align when placing text so things don't overlap
        x_align = 0.64
        fig, ax = make_fig_ax(x_align=x_align)

        trues = df_avg['all_y_true']
        preds = df_avg['all_y_pred']

        if groups is not None:
            # Need to use the indices of df.groupby to get the original list of groups
            grps = df_all_grouped.groups
            inds = list()
            for k, v, in grps.items():
                inds.append(v[0])
            groups = np.array(df_all['all_groups'])[inds]

        # gather max and min
        maxx = max(np.nanmax(trues), np.nanmax(preds))
        minn = min(np.nanmin(trues), np.nanmin(preds))

        # draw dashed diagonal parity line
        ax.plot([minn, maxx], [minn, maxx], 'k--', lw=2, zorder=1)

        # set axis labels
        ax.set_xlabel('True ' + x_label, fontsize=16)
        ax.set_ylabel('Predicted ' + x_label, fontsize=16)

        # set tick labels
        _set_tick_labels(ax, maxx, minn)

        if groups is not None:
            groups_unique = np.unique(groups)
            colors = ['blue', 'green', 'red', 'purple', 'orange', 'black', 'grey']
            shapes = ['o', 's', '^', 'x', '<', '>', 'h']
            count = 0
            lap = 0
            for group in groups_unique:
                ax.scatter(np.array(trues)[np.where(groups==group)], np.array(preds)[np.where(groups==group)], c=colors[count], marker=shapes[lap], zorder=2, s=100, alpha=0.7, label=group)
                if count < len(colors)-1:
                    count += 1
                else:
                    lap += 1
                    count = 0
            ax.legend(loc='best', fontsize=8)
        else:
            ax.scatter(trues, preds, c='b', edgecolor='darkblue', zorder=2, s=100, alpha=0.7)

        if ebars is not None:
            if groups is not None:
                groups_unique = np.unique(groups)
                colors = ['blue', 'green', 'red', 'purple', 'orange', 'black', 'grey']
                shapes = ['o', 's', '^', 'x', '<', '>', 'h']
                count = 0
                lap = 0
                for group in groups_unique:
                    ax.errorbar(np.array(trues)[np.where(groups == group)], np.array(preds)[np.where(groups == group)], yerr=np.array(df_avg['ebars'])[np.where(groups == group)], fmt=shapes[lap], markerfacecolor=colors[count], markeredgecolor=colors[count], ecolor=colors[count], markersize=10, alpha=0.7, capsize=3)
                    if count < len(colors) - 1:
                        count += 1
                    else:
                        lap += 1
                        count = 0
                ax.legend(loc='best', fontsize=8)
            else:
                ax.errorbar(trues, preds, yerr=df_avg['ebars'], fmt='o', markerfacecolor='blue', markeredgecolor='black', ecolor='blue', markersize=10, alpha=0.7, capsize=3)
        else:
            if groups is not None:
                groups_unique = np.unique(groups)
                colors = ['blue', 'green', 'red', 'purple', 'orange', 'black', 'grey']
                shapes = ['o', 's', '^', 'x', '<', '>', 'h']
                count = 0
                lap = 0
                for group in groups_unique:
                    ax.errorbar(np.array(trues)[np.where(groups == group)], np.array(preds)[np.where(groups == group)], yerr=np.array(df_std['all_y_pred'])[np.where(groups == group)], fmt=shapes[lap], markerfacecolor=colors[count], markeredgecolor=colors[count], ecolor=colors[count], markersize=10, alpha=0.7, capsize=3)
                    if count < len(colors) - 1:
                        count += 1
                    else:
                        lap += 1
                        count = 0
                ax.legend(loc='best', fontsize=8)
            else:
                ax.errorbar(trues, preds, yerr=df_std['all_y_pred'], fmt='o', markerfacecolor='blue', markeredgecolor='black', ecolor='blue', markersize=10, alpha=0.7, capsize=3)

        stats_files_dict = dict()
        for splitdir in splitdirs:
            if file_extension == '.xlsx':
                stats_files_dict[splitdir] = pd.read_excel(os.path.join(os.path.join(savepath, splitdir), data_type + '_stats_summary.xlsx'), engine='openpyxl').to_dict('records')[0]
            elif file_extension == '.csv':
                stats_files_dict[splitdir] = pd.read_csv(os.path.join(os.path.join(savepath, splitdir), data_type + '_stats_summary.csv')).to_dict('records')[0]
            metrics_list = list(stats_files_dict[splitdir].keys())

        avg_stats = dict()
        for metric in metrics_list:
            stats = list()
            for splitdir in splitdirs:
                stats.append(stats_files_dict[splitdir][metric])
            avg_stats[metric] = (np.mean(stats), np.std(stats))

        plot_stats(fig, avg_stats, x_align=x_align, y_align=0.90)

        if ebars is not None:
            if groups is not None:
                fig.savefig(os.path.join(savepath, 'parity_plot_allsplits_average_withcalibratederrorbars_grouplabels_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')
            else:
                fig.savefig(os.path.join(savepath, 'parity_plot_allsplits_average_withcalibratederrorbars_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')
            df = pd.DataFrame({'y true': trues, 'average predicted values': preds, 'error bar values': df_avg['ebars']})
            if file_extension == '.xlsx':
                df.to_excel(os.path.join(savepath, 'parity_plot_allsplits_average_withcalibratederrorbars_' + str(data_type) + '.xlsx'), index=False)
            elif file_extension == '.csv':
                df.to_csv(os.path.join(savepath, 'parity_plot_allsplits_average_withcalibratederrorbars_' + str(data_type) + '.csv'), index=False)
        else:
            if groups is not None:
                fig.savefig(os.path.join(savepath, 'parity_plot_allsplits_average_grouplabels_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')
            else:
                fig.savefig(os.path.join(savepath, 'parity_plot_allsplits_average_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')
            df = pd.DataFrame({'y true': trues, 'average predicted values': preds, 'error bar values': df_std['all_y_pred']})
            if file_extension == '.xlsx':
                df.to_excel(os.path.join(savepath, 'parity_plot_allsplits_average_' + str(data_type) + '.xlsx'), index=False)
            elif file_extension == '.csv':
                df.to_csv(os.path.join(savepath, 'parity_plot_allsplits_average_' + str(data_type) + '.csv'), index=False)

        df_stats = pd.DataFrame().from_dict(avg_stats)
        if file_extension == '.xlsx':
            df_stats.to_excel(os.path.join(savepath, str(data_type) + '_average_stdev_stats_summary.xlsx'), index=False)
        elif file_extension == '.csv':
            df_stats.to_csv(os.path.join(savepath, str(data_type) + '_average_stdev_stats_summary.csv'), index=False)

        if show_figure == True:
            plt.show()
        else:
            plt.close()
        return
    @classmethod
    def plot_metric_vs_group(cls, groups, stats_group_df, metrics_list, savepath, data_type, show_figure, file_extension='.csv', image_dpi=250):
        for metric in metrics_list:
            stats = list()
            for group in groups:
                stats.append(stats_group_df[metric][group])
            avg_stats = {metric: (np.mean(stats), np.std(stats))}

            # make fig and ax, use x_align when placing text so things don't overlap
            x_align = 0.64
            fig, ax = make_fig_ax(x_align=x_align)

            # do the actual plotting
            ax.scatter(groups, stats, c='blue', alpha=0.7, edgecolor='darkblue', zorder=2, s=100)

            # set axis labels
            ax.set_xlabel('Group', fontsize=14)
            ax.set_ylabel(metric, fontsize=14)
            ax.set_xticklabels(labels=groups, fontsize=14)
            plot_stats(fig, avg_stats, x_align=x_align, y_align=0.90)

            fig.savefig(os.path.join(savepath, str(metric) + '_value_per_group_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')
            if show_figure == True:
                plt.show()
            else:
                plt.close()
        return
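# --- Illustrative usage sketch (not part of the mastml.plots module) ---
# A minimal, hypothetical example of generating a parity plot with Scatter.plot_predicted_vs_true
# on synthetic data. The save directory 'example_results' and the synthetic arrays are assumptions
# for illustration only; the keyword arguments follow the method definition above.
#
#   import numpy as np
#   import pandas as pd
#   from mastml.plots import Scatter
#
#   y_true = pd.Series(np.random.uniform(0, 10, 50))
#   y_pred = pd.Series(np.array(y_true) + np.random.normal(0, 0.5, 50))
#   Scatter.plot_predicted_vs_true(y_true=y_true, y_pred=y_pred, savepath='example_results',
#                                  data_type='test', x_label='target value', show_figure=False)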
class Error():
    """
    Class to make plots related to model error assessment and uncertainty quantification

    Args:
        None

    Methods:
        plot_normalized_error: Method to plot the normalized residual errors of a model prediction
            Args:
                residuals: (pd.Series), series containing the true errors (model residuals)
                savepath: (str), string denoting the save path to save the figure to
                data_type: (str), string denoting the data type, e.g. train, test, leftout
                model_errors: (pd.Series), series containing the predicted model errors (optional, default None)
                show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
            Returns:
                None

        plot_cumulative_normalized_error: Method to plot the cumulative normalized residual errors of a model prediction
            Args:
                residuals: (pd.Series), series containing the true errors (model residuals)
                savepath: (str), string denoting the save path to save the figure to
                data_type: (str), string denoting the data type, e.g. train, test, leftout
                model_errors: (pd.Series), series containing the predicted model errors (optional, default None)
                show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
            Returns:
                None

        plot_rstat: Method for plotting the r-statistic distribution (true divided by predicted error)
            Args:
                savepath: (str), string denoting the save path to save the figure to
                data_type: (str), string denoting the data type, e.g. train, test, leftout
                residuals: (pd.Series), series containing the true errors (model residuals)
                model_errors: (pd.Series), series containing the predicted model errors
                show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
                is_calibrated: (bool), whether or not the model errors have been recalibrated (default False)
            Returns:
                None

        plot_rstat_uncal_cal_overlay: Method for plotting the r-statistic distribution for two cases together: the as-obtained uncalibrated model errors and calibrated errors
            Args:
                savepath: (str), string denoting the save path to save the figure to
                data_type: (str), string denoting the data type, e.g. train, test, leftout
                residuals: (pd.Series), series containing the true errors (model residuals)
                model_errors: (pd.Series), series containing the predicted model errors
                model_errors_cal: (pd.Series), series containing the calibrated predicted model errors
                show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
            Returns:
                None

        plot_real_vs_predicted_error: Sometimes called the RvE plot, or residual vs. error plot, this method plots the binned RMS residuals as a function of the binned model errors
            Args:
                savepath: (str), string denoting the save path to save the figure to
                model: (mastml.models object), a MAST-ML model object, e.g. SklearnModel or EnsembleModel
                data_type: (str), string denoting the data type, e.g. train, test, leftout
                model_errors: (pd.Series), series containing the predicted model errors
                residuals: (pd.Series), series containing the true errors (model residuals)
                dataset_stdev: (float), the standard deviation of the training dataset
                show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
                is_calibrated: (bool), whether or not the model errors have been recalibrated (default False)
                well_sampled_fraction: (float), number denoting whether a bin qualifies as well-sampled or not. Default to 0.025 (2.5% of total samples). Only affects visuals, not fitting
            Returns:
                None

        plot_real_vs_predicted_error_uncal_cal_overlay: Method for making the residual vs. error plot for two cases together: using the as-obtained uncalibrated model errors and calibrated errors
            Args:
                savepath: (str), string denoting the save path to save the figure to
                model: (mastml.models object), a MAST-ML model object, e.g. SklearnModel or EnsembleModel
                data_type: (str), string denoting the data type, e.g. train, test, leftout
                model_errors: (pd.Series), series containing the predicted model errors
                model_errors_cal: (pd.Series), series containing the calibrated predicted model errors
                residuals: (pd.Series), series containing the true errors (model residuals)
                dataset_stdev: (float), the standard deviation of the training dataset
                show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False)
                well_sampled_fraction: (float), number denoting whether a bin qualifies as well-sampled or not. Default to 0.025 (2.5% of total samples). Only affects visuals, not fitting
            Returns:
                None
    """

    @classmethod
    def plot_qq(cls, residuals, savepath, data_type, show_figure, image_dpi=250):
        x_align = 0.64
        fig, ax = make_fig_ax(x_align=x_align)

        fig = sm.qqplot(data=residuals, dist=scipy.stats.distributions.norm, line='45', fit=True, ax=ax, markerfacecolor='blue', markeredgecolor='darkblue', zorder=2, markersize=10, alpha=0.7)
        ax.set_xlim(-5, 5)
        ax.set_ylim(-5, 5)
        ax.set_xlabel('Normal distribution quantiles', fontsize=14)
        ax.set_ylabel('Model residual quantiles', fontsize=14)
        ax.get_lines()[1].set_color("black")
        ax.get_lines()[1].set_linewidth(1.5)
        ax.get_lines()[1].set_linestyle("--")
        ax.xaxis.get_label().set_fontsize(12)
        ax.yaxis.get_label().set_fontsize(12)

        fig.savefig(os.path.join(savepath, 'qq_plot_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')
        if show_figure is True:
            plt.show()
        else:
            plt.close()
        return
    @classmethod
    def plot_normalized_error(cls, residuals, savepath, data_type, model_errors=None, show_figure=False, file_extension='.csv', image_dpi=250):
        x_align = 0.64
        fig, ax = make_fig_ax(x_align=x_align)
        mu = 0
        sigma = 1
        residuals[residuals == 0.0] = 10**-6
        normalized_residuals = residuals / np.std(residuals)
        density_residuals = gaussian_kde(normalized_residuals)
        x = np.linspace(mu - 5 * sigma, mu + 5 * sigma, residuals.shape[0])
        ax.plot(x, norm.pdf(x, mu, sigma), linewidth=4, color='blue', label="Analytical Gaussian")
        ax.plot(x, density_residuals(x), linewidth=4, color='green', label="Model Residuals")
        maxx = 5
        minn = -5

        if model_errors is not None:
            model_errors[model_errors == 0.0] = 0.0001
            rstat = residuals / model_errors
            density_errors = gaussian_kde(rstat)
            maxy = max(max(density_residuals(x)), max(norm.pdf(x, mu, sigma)), max(density_errors(x)))
            miny = min(min(density_residuals(x)), min(norm.pdf(x, mu, sigma)), min(density_errors(x)))
            ax.plot(x, density_errors(x), linewidth=4, color='purple', label="Model Errors")
            # Save data to csv file
            data_dict = {"Plotted x values": x,
                         "model_errors": model_errors,
                         # "analytical gaussian (plotted y blue values)": norm.pdf(x, mu, sigma),
                         "residuals": residuals,
                         "model normalized residuals (plotted y green values)": density_residuals(x),
                         "model errors (plotted y purple values)": density_errors(x)}
        else:
            # Save data to csv file
            data_dict = {"x values": x,
                         # "analytical gaussian": norm.pdf(x, mu, sigma),
                         "model normalized residuals (plotted y green values)": density_residuals(x)}
            maxy = max(max(density_residuals(x)), max(norm.pdf(x, mu, sigma)))
            miny = min(min(density_residuals(x)), min(norm.pdf(x, mu, sigma)))

        if file_extension == '.xlsx':
            pd.DataFrame(data_dict).to_excel(os.path.join(savepath, 'normalized_error_data_' + str(data_type) + '.xlsx'))
        elif file_extension == '.csv':
            pd.DataFrame(data_dict).to_csv(os.path.join(savepath, 'normalized_error_data_' + str(data_type) + '.csv'))

        ax.legend(loc=0, fontsize=12, frameon=False)
        ax.set_xlabel(r"$\mathrm{x}/\mathit{\sigma}$", fontsize=18)
        ax.set_ylabel("Probability density", fontsize=18)
        _set_tick_labels_different(ax, maxx, minn, maxy, miny)
        fig.savefig(os.path.join(savepath, 'normalized_errors_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')

        if show_figure is True:
            plt.show()
        else:
            plt.close()
        return
    @classmethod
    def plot_cumulative_normalized_error(cls, residuals, savepath, data_type, model_errors=None, show_figure=False, file_extension='.csv', image_dpi=250):
        x_align = 0.64
        fig, ax = make_fig_ax(x_align=x_align)

        analytic_gau = np.random.normal(0, 1, 10000)
        analytic_gau = abs(analytic_gau)
        n_analytic = np.arange(1, len(analytic_gau) + 1) / float(len(analytic_gau))
        X_analytic = np.sort(analytic_gau)
        residuals[residuals == 0.0] = 10**-6
        normalized_residuals = abs((residuals) / np.std(residuals))
        n_residuals = np.arange(1, len(normalized_residuals) + 1) / float(len(normalized_residuals))
        X_residuals = np.sort(normalized_residuals)

        ax.set_xlabel(r"$\mathrm{x}/\mathit{\sigma}$", fontsize=18)
        ax.set_ylabel("Fraction", fontsize=18)
        ax.step(X_residuals, n_residuals, linewidth=3, color='green', label="Model Residuals")
        ax.step(X_analytic, n_analytic, linewidth=3, color='blue', label="Analytical Gaussian")
        ax.set_xlim([0, 5])

        if model_errors is not None:
            model_errors[model_errors == 0.0] = 0.0001
            rstat = abs((residuals) / model_errors)
            n_errors = np.arange(1, len(rstat) + 1) / float(len(rstat))
            X_errors = np.sort(rstat)
            ax.step(X_errors, n_errors, linewidth=3, color='purple', label="Model Errors")
            # Save data to csv file
            data_dict = {
                # "Analytical Gaussian values": analytic_gau,
                # "Analytical Gaussian (sorted, blue data)": X_analytic,
                "residuals": residuals,
                "normalized residuals": normalized_residuals,
                "Model Residuals (sorted, green data)": X_residuals,
                "Model error values (r value: (ytrue-ypred)/(model error avg))": rstat,
                "Model errors (sorted, purple values)": X_errors}
        else:
            # Save data to csv file
            data_dict = {
                # "x analytical": X_analytic,
                # "analytical gaussian": n_analytic,
                "Model Residuals (sorted, green data)": X_residuals,
                "model residuals": n_residuals}

        # Save this way to avoid issue with different array sizes in data_dict
        df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data_dict.items()]))
        if file_extension == '.xlsx':
            df.to_excel(os.path.join(savepath, 'cumulative_normalized_errors_' + str(data_type) + '.xlsx'), index=False)
        elif file_extension == '.csv':
            df.to_csv(os.path.join(savepath, 'cumulative_normalized_errors_' + str(data_type) + '.csv'), index=False)

        ax.legend(loc=0, fontsize=14, frameon=False)
        xlabels = np.linspace(2, 3, 3)
        ylabels = np.linspace(0.9, 1, 2)
        axin = zoomed_inset_axes(ax, 2.5, loc=7)
        axin.step(X_residuals, n_residuals, linewidth=3, color='green', label="Model Residuals")
        axin.step(X_analytic, n_analytic, linewidth=3, color='blue', label="Analytical Gaussian")
        if model_errors is not None:
            axin.step(X_errors, n_errors, linewidth=3, color='purple', label="Model Errors")
        axin.set_xticklabels(xlabels, fontsize=8, rotation=90)
        axin.set_yticklabels(ylabels, fontsize=8)
        axin.set_xlim([2, 3])
        axin.set_ylim([0.9, 1])
        maxx = 5
        minn = 0
        maxy = 1.1
        miny = 0
        _set_tick_labels_different(ax, maxx, minn, maxy, miny)
        mark_inset(ax, axin, loc1=1, loc2=2)
        fig.savefig(os.path.join(savepath, 'cumulative_normalized_errors_' + str(data_type) + '.png'), dpi=image_dpi, bbox_inches='tight')

        if show_figure is True:
            plt.show()
        else:
            plt.close()
        return
    @classmethod
    def plot_rstat(cls, savepath, data_type, residuals, model_errors, show_figure=False, is_calibrated=False, image_dpi=250):
        # Eliminate model errors with value 0, so that the ratios can be calculated
        zero_indices = []
        for i in range(0, len(model_errors)):
            if model_errors[i] == 0:
                zero_indices.append(i)
        residuals = np.delete(residuals, zero_indices)
        model_errors = np.delete(model_errors, zero_indices)

        # make data for gaussian plot
        gaussian_x = np.linspace(-5, 5, 1000)

        # create plot
        x_align = 0.64
        fig, ax = make_fig_ax(x_align=x_align)
        ax.set_xlabel('residuals / model error estimates')
        ax.set_ylabel('relative counts')
        ax.hist(residuals / model_errors, bins=30, color='blue', edgecolor='black', density=True)
        ax.plot(gaussian_x, stats.norm.pdf(gaussian_x, 0, 1), label='Gaussian mu: 0 std: 1', color='black', linestyle='--', linewidth=1.5)
        ax.text(0.05, 0.9, 'mean = %.3f' % (np.mean(residuals / model_errors)), transform=ax.transAxes)
        ax.text(0.05, 0.85, 'std = %.3f' % (np.std(residuals / model_errors)), transform=ax.transAxes)

        if is_calibrated == False:
            calibrate = 'uncalibrated'
        if is_calibrated == True:
            calibrate = 'calibrated'

        fig.savefig(os.path.join(savepath, 'rstat_histogram_' + str(data_type) + '_' + calibrate + '.png'), dpi=image_dpi, bbox_inches='tight')

        if show_figure is True:
            plt.show()
        else:
            plt.close()
        return
    @classmethod
    def plot_rstat_uncal_cal_overlay(cls, savepath, data_type, residuals, model_errors, model_errors_cal, show_figure=False, image_dpi=250):
        # Eliminate model errors with value 0, so that the ratios can be calculated
        zero_indices = []
        for i in range(0, len(model_errors)):
            if model_errors[i] == 0:
                zero_indices.append(i)
        residuals = np.delete(residuals, zero_indices)
        model_errors = np.delete(model_errors, zero_indices)
        model_errors_cal = np.delete(model_errors_cal, zero_indices)

        # make data for gaussian plot
        gaussian_x = np.linspace(-5, 5, 1000)

        # create plot
        x_align = 0.64
        fig, ax = make_fig_ax(x_align=x_align)
        ax.set_xlabel('residuals / model error estimates')
        ax.set_ylabel('relative counts')
        ax.hist(residuals / model_errors, bins=30, color='gray', edgecolor='black', density=True, alpha=0.4)
        ax.hist(residuals / model_errors_cal, bins=30, color='blue', edgecolor='black', density=True, alpha=0.4)
        ax.plot(gaussian_x, stats.norm.pdf(gaussian_x, 0, 1), label='Gaussian mu: 0 std: 1', color='black', linestyle='--', linewidth=1.5)
        ax.text(0.05, 0.9, 'mean = %.3f' % (np.mean(residuals / model_errors)), transform=ax.transAxes, fontdict={'fontsize': 10, 'color': 'gray'})
        ax.text(0.05, 0.85, 'std = %.3f' % (np.std(residuals / model_errors)), transform=ax.transAxes, fontdict={'fontsize': 10, 'color': 'gray'})
        ax.text(0.05, 0.8, 'mean = %.3f' % (np.mean(residuals / model_errors_cal)), transform=ax.transAxes, fontdict={'fontsize': 10, 'color': 'blue'})
        ax.text(0.05, 0.75, 'std = %.3f' % (np.std(residuals / model_errors_cal)), transform=ax.transAxes, fontdict={'fontsize': 10, 'color': 'blue'})

        fig.savefig(os.path.join(savepath, 'rstat_histogram_' + str(data_type) + '_uncal_cal_overlay.png'), dpi=image_dpi, bbox_inches='tight')

        if show_figure is True:
            plt.show()
        else:
            plt.close()
        return
    @classmethod
    def plot_real_vs_predicted_error(cls, savepath, model, data_type, model_errors, residuals, dataset_stdev, show_figure=False, is_calibrated=False, well_sampled_number=30, image_dpi=250):
        bin_values, rms_residual_values, num_values_per_bin, number_of_bins, ms_residual_values, var_sq_residual_values = ErrorUtils()._parse_error_data(model_errors=model_errors, residuals=residuals, dataset_stdev=dataset_stdev)

        model_name = model.model.__class__.__name__
        if model_name == 'RandomForestRegressor':
            model_type = 'RF'
        elif model_name == 'GradientBoostingRegressor':
            model_type = 'GBR'
        elif model_name == 'ExtraTreesRegressor':
            model_type = 'ET'
        elif model_name == 'GaussianProcessRegressor':
            model_type = 'GPR'
        elif model_name == 'BaggingRegressor':
            model_type = 'BR'
        elif model_name == 'AdaBoostRegressor':
            model_type = 'ABR'

        if data_type not in ['train', 'test', 'leaveout']:
            print('Error: data_test_type must be one of "train", "test" or "leaveout"')
            exit()

        # Make RF error plot
        fig, ax = make_fig_ax(aspect_ratio=0.5, x_align=0.65)

        linear = LinearRegression(fit_intercept=True)
        # Fit just blue circle data
        # Find nan entries
        nans = np.argwhere(np.isnan(rms_residual_values)).tolist()

        # use nans (which are indices) to delete relevant parts of bin_values and
        # rms_residual_values as they can't be used to fit anyway
        bin_values_copy = np.empty_like(bin_values)
        bin_values_copy[:] = bin_values
        rms_residual_values_copy = np.empty_like(rms_residual_values)
        rms_residual_values_copy[:] = rms_residual_values

        bin_values_copy = np.delete(bin_values_copy, nans)
        rms_residual_values_copy = np.delete(rms_residual_values_copy, nans)
        num_values_per_bin_copy = np.array(num_values_per_bin)[np.array(num_values_per_bin) != 0]

        # Only examine the bins that are well-sampled, i.e. have number of data points in them above a given threshold
        #well_sampled_number = round(well_sampled_fraction*np.sum(num_values_per_bin_copy))
        rms_residual_values_wellsampled = rms_residual_values_copy[np.where(num_values_per_bin_copy > well_sampled_number)]
        bin_values_wellsampled = bin_values_copy[np.where(num_values_per_bin_copy > well_sampled_number)]
        num_values_per_bin_wellsampled = num_values_per_bin_copy[np.where(num_values_per_bin_copy > well_sampled_number)]
        rms_residual_values_poorlysampled = rms_residual_values_copy[np.where(num_values_per_bin_copy <= well_sampled_number)]
        bin_values_poorlysampled = bin_values_copy[np.where(num_values_per_bin_copy <= well_sampled_number)]
        num_values_per_bin_poorlysampled = num_values_per_bin_copy[np.where(num_values_per_bin_copy <= well_sampled_number)]

        yerr = list()
        for i, j, k in zip(var_sq_residual_values, num_values_per_bin, rms_residual_values):
            if j > 1:
                yerr.append(np.sqrt(i) / (2 * np.sqrt(j) * k))
            else:
                yerr.append(1)
        yerr = np.array(yerr)
        yerr_wellsampled = yerr[np.where(num_values_per_bin > well_sampled_number)[0]]
        yerr_poorlysampled = yerr[np.where(num_values_per_bin <= well_sampled_number)[0]]

        ax.scatter(bin_values_wellsampled, rms_residual_values_wellsampled, s=80, color='blue', alpha=0.7)
        ax.scatter(bin_values_poorlysampled, rms_residual_values_poorlysampled, s=40, color='blue', alpha=0.3)
        ax.errorbar(bin_values_wellsampled, rms_residual_values_wellsampled, yerr=yerr_wellsampled, ecolor='blue', capsize=2, linewidth=0, elinewidth=1)
        ax.errorbar(bin_values_poorlysampled, rms_residual_values_poorlysampled, yerr=yerr_poorlysampled, ecolor='blue', capsize=2, linewidth=0, elinewidth=1, alpha=0.4)

        ax.set_xlabel(str(model_type) + ' model errors / dataset stdev', fontsize=12)
        ax.set_ylabel('RMS Absolute residuals\n / dataset stdev', fontsize=12)
        ax.tick_params(labelsize=10)

        if not rms_residual_values_copy.size:
            print("---WARNING: ALL ERRORS TOO LARGE FOR PLOTTING---")
            exit()
        else:
            # Fit the line to all data, including the poorly sampled data, and weight data points by number of samples per bin
            linear.fit(np.array(bin_values_copy).reshape(-1, 1), rms_residual_values_copy, sample_weight=num_values_per_bin_copy)
            yfit = linear.predict(np.array(bin_values_copy).reshape(-1, 1))
            ax.plot(bin_values_copy, yfit, 'k--', linewidth=2)
            r2 = r2_score(rms_residual_values_copy, yfit, sample_weight=num_values_per_bin_copy)

            slope = linear.coef_
            intercept = linear.intercept_

        divider = make_axes_locatable(ax)
        axbarx = divider.append_axes("top", 1.2, pad=0.12, sharex=ax)

        axbarx.bar(x=bin_values, height=num_values_per_bin, width=bin_values[1] - bin_values[0], color='blue', edgecolor='black', alpha=0.7)
        axbarx.tick_params(labelsize=10, axis='y')
        axbarx.tick_params(labelsize=0, axis='x')
        axbarx.set_ylabel('Counts', fontsize=12)

        total_samples = sum(num_values_per_bin)

        #xmax = max(max(bin_values_copy) + 0.05, 1.6)
        #ymax = max(1.3, max(rms_residual_values))
        xmax = round(max(bin_values_copy) * 1.10, 2)
        ymax = round(max(rms_residual_values) * 1.10, 2)

        axbarx.text(0.6 * xmax, round(0.9 * max(num_values_per_bin)), 'Total counts = ' + str(total_samples), fontsize=10)

        ax.set_ylim(bottom=0, top=ymax)
        axbarx.set_ylim(bottom=0, top=round(max(num_values_per_bin) + 0.1 * max(num_values_per_bin)))
        ax.set_xlim(left=0, right=xmax)

        ax.text(0.02, 0.9*ymax, 'R$^2$ = %3.2f ' % r2, fontdict={'fontsize': 10, 'color': 'k'})
        ax.text(0.02, 0.8*ymax, 'slope = %3.2f ' % slope, fontdict={'fontsize': 10, 'color': 'k'})
        ax.text(0.02, 0.7*ymax, 'intercept = %3.2f ' % intercept, fontdict={'fontsize': 10, 'color': 'k'})

        # Plot y = x line as reference point
        maxx = max(xmax, ymax)
        ax.plot([0, maxx], [0, maxx], '--', lw=2, zorder=1, color='gray', alpha=0.5)

        if is_calibrated == False:
            calibrate = 'uncalibrated'
        if is_calibrated == True:
            calibrate = 'calibrated'

        fig.savefig(os.path.join(savepath, str(model_type) + '_residuals_vs_modelerror_' + str(data_type) + '_' + calibrate + '.png'), dpi=image_dpi, bbox_inches='tight')

        if show_figure is True:
            plt.show()
        else:
            plt.close()
        return
    @classmethod
    def plot_real_vs_predicted_error_uncal_cal_overlay(cls, savepath, model, data_type, model_errors, model_errors_cal, residuals, dataset_stdev, show_figure=False, well_sampled_number=30, image_dpi=250):
        bin_values_uncal, rms_residual_values_uncal, num_values_per_bin_uncal, number_of_bins_uncal, ms_residual_values_uncal, var_sq_residual_values_uncal = ErrorUtils()._parse_error_data(model_errors=model_errors, residuals=residuals, dataset_stdev=dataset_stdev)
        bin_values_cal, rms_residual_values_cal, num_values_per_bin_cal, number_of_bins_cal, ms_residual_values_cal, var_sq_residual_values_cal = ErrorUtils()._parse_error_data(model_errors=model_errors_cal, residuals=residuals, dataset_stdev=dataset_stdev)

        model_name = model.model.__class__.__name__
        if model_name == 'RandomForestRegressor':
            model_type = 'RF'
        elif model_name == 'GradientBoostingRegressor':
            model_type = 'GBR'
        elif model_name == 'ExtraTreesRegressor':
            model_type = 'ET'
        elif model_name == 'GaussianProcessRegressor':
            model_type = 'GPR'
        elif model_name == 'BaggingRegressor':
            model_type = 'BR'
        elif model_name == 'AdaBoostRegressor':
            model_type = 'ABR'

        if data_type not in ['train', 'test', 'leaveout']:
            print('Error: data_test_type must be one of "train", "test" or "leaveout"')
            exit()

        # Make RF error plot
        fig, ax = make_fig_ax(aspect_ratio=0.5, x_align=0.65)

        linear_uncal = LinearRegression(fit_intercept=True)
        linear_cal = LinearRegression(fit_intercept=True)

        # Only examine the bins that are well-sampled, i.e. have number of data points in them above a given threshold
        #well_sampled_number_uncal = round(well_sampled_fraction*np.sum(num_values_per_bin_uncal))
        rms_residual_values_wellsampled_uncal = rms_residual_values_uncal[np.where(num_values_per_bin_uncal > well_sampled_number)[0]]
        bin_values_wellsampled_uncal = bin_values_uncal[np.where(num_values_per_bin_uncal > well_sampled_number)[0]]
        num_values_per_bin_wellsampled_uncal = num_values_per_bin_uncal[np.where(num_values_per_bin_uncal > well_sampled_number)[0]]
        rms_residual_values_poorlysampled_uncal = rms_residual_values_uncal[np.where(num_values_per_bin_uncal <= well_sampled_number)[0]]
        bin_values_poorlysampled_uncal = bin_values_uncal[np.where(num_values_per_bin_uncal <= well_sampled_number)[0]]
        num_values_per_bin_poorlysampled_uncal = num_values_per_bin_uncal[np.where(num_values_per_bin_uncal <= well_sampled_number)[0]]

        yerr_uncal = list()
        for i, j, k in zip(var_sq_residual_values_uncal, num_values_per_bin_uncal, rms_residual_values_uncal):
            if j > 1:
                yerr_uncal.append(np.sqrt(i) / (2 * np.sqrt(j) * k))
            else:
                yerr_uncal.append(1)
        yerr_uncal = np.array(yerr_uncal)

        yerr_cal = list()
        for i, j, k in zip(var_sq_residual_values_cal, num_values_per_bin_cal, rms_residual_values_cal):
            if j > 1:
                yerr_cal.append(np.sqrt(i) / (2 * np.sqrt(j) * k))
            else:
                yerr_cal.append(1)
        yerr_cal = np.array(yerr_cal)

        yerr_wellsampled_uncal = yerr_uncal[np.where(num_values_per_bin_uncal > well_sampled_number)[0]]
        yerr_poorlysampled_uncal = yerr_uncal[np.where(num_values_per_bin_uncal <= well_sampled_number)[0]]

        #well_sampled_number_cal = round(well_sampled_fraction * np.sum(num_values_per_bin_cal))
        rms_residual_values_wellsampled_cal = rms_residual_values_cal[np.where(num_values_per_bin_cal > well_sampled_number)[0]]
        bin_values_wellsampled_cal = bin_values_cal[np.where(num_values_per_bin_cal > well_sampled_number)]
        num_values_per_bin_wellsampled_cal = num_values_per_bin_cal[np.where(num_values_per_bin_cal > well_sampled_number)[0]]
        rms_residual_values_poorlysampled_cal = rms_residual_values_cal[np.where(num_values_per_bin_cal <= well_sampled_number)[0]]
        bin_values_poorlysampled_cal = bin_values_cal[np.where(num_values_per_bin_cal <= well_sampled_number)[0]]
        num_values_per_bin_poorlysampled_cal = num_values_per_bin_cal[np.where(num_values_per_bin_cal <= well_sampled_number)[0]]
        yerr_wellsampled_cal = yerr_cal[np.where(num_values_per_bin_cal > well_sampled_number)[0]]
        yerr_poorlysampled_cal = yerr_cal[np.where(num_values_per_bin_cal <= well_sampled_number)[0]]

        ax.scatter(bin_values_wellsampled_uncal, rms_residual_values_wellsampled_uncal, s=80, color='gray', edgecolor='gray', alpha=0.7, label='uncalibrated')
        ax.scatter(bin_values_poorlysampled_uncal, rms_residual_values_poorlysampled_uncal, s=40, color='gray', edgecolor='gray', alpha=0.4)
        ax.errorbar(bin_values_wellsampled_uncal, rms_residual_values_wellsampled_uncal, yerr=yerr_wellsampled_uncal, ecolor='gray', capsize=2, linewidth=0, elinewidth=1)
        ax.errorbar(bin_values_poorlysampled_uncal, rms_residual_values_poorlysampled_uncal, yerr=yerr_poorlysampled_uncal, ecolor='gray', capsize=2, linewidth=0, elinewidth=1, alpha=0.4)

        ax.scatter(bin_values_wellsampled_cal, rms_residual_values_wellsampled_cal, s=80, color='blue', edgecolor='blue', alpha=0.7, label='calibrated')
        ax.scatter(bin_values_poorlysampled_cal, rms_residual_values_poorlysampled_cal, s=40, color='blue', edgecolor='blue', alpha=0.4)
        ax.errorbar(bin_values_wellsampled_cal, rms_residual_values_wellsampled_cal, yerr=yerr_wellsampled_cal, ecolor='blue', capsize=2, linewidth=0, elinewidth=1)
        ax.errorbar(bin_values_poorlysampled_cal, rms_residual_values_poorlysampled_cal, yerr=yerr_poorlysampled_cal, ecolor='blue', capsize=2, linewidth=0, elinewidth=1, alpha=0.4)

        ax.set_xlabel(str(model_type) + ' model errors / dataset stdev', fontsize=12)
        ax.set_ylabel('RMS Absolute residuals\n / dataset stdev', fontsize=12)
        ax.tick_params(labelsize=10)

        # Fit the line to all data, including the poorly sampled data, and weight data points by number of samples per bin
        linear_uncal.fit(np.array(bin_values_uncal).reshape(-1, 1), rms_residual_values_uncal, sample_weight=num_values_per_bin_uncal)
        yfit_uncal = linear_uncal.predict(np.array(bin_values_uncal).reshape(-1, 1))
        ax.plot(bin_values_uncal, yfit_uncal, 'gray', linewidth=2)
        r2_uncal = r2_score(rms_residual_values_uncal, yfit_uncal, sample_weight=num_values_per_bin_uncal)
        slope_uncal = linear_uncal.coef_
        intercept_uncal = linear_uncal.intercept_

        # Fit the line to all data, including the poorly sampled data, and weight data points by number of samples per bin
        linear_cal.fit(np.array(bin_values_cal).reshape(-1, 1), rms_residual_values_cal, sample_weight=num_values_per_bin_cal)
        yfit_cal = linear_cal.predict(np.array(bin_values_cal).reshape(-1, 1))
        ax.plot(bin_values_cal, yfit_cal, 'blue', linewidth=2)
        r2_cal = r2_score(rms_residual_values_cal, yfit_cal, sample_weight=num_values_per_bin_cal)
        slope_cal = linear_cal.coef_
        intercept_cal = linear_cal.intercept_

        divider = make_axes_locatable(ax)
        axbarx = divider.append_axes("top", 1.2, pad=0.12, sharex=ax)

        axbarx.bar(x=bin_values_uncal, height=num_values_per_bin_uncal, width=bin_values_uncal[1] - bin_values_uncal[0], color='gray', edgecolor='gray', alpha=0.3)
        axbarx.bar(x=bin_values_cal, height=num_values_per_bin_cal, width=bin_values_cal[1] - bin_values_cal[0], color='blue', edgecolor='blue', alpha=0.3)
        axbarx.tick_params(labelsize=10, axis='y')
        axbarx.tick_params(labelsize=0, axis='x')
        axbarx.set_ylabel('Counts', fontsize=12)

        #xmax = max(max(bin_values_uncal) + 0.05, 1.6)
        #ymax = max(1.3, max(rms_residual_values_uncal))
        xmax = round(max([max(bin_values_uncal), max(bin_values_cal)]) * 1.10, 2)
        ymax = round(max([max(rms_residual_values_uncal), max(rms_residual_values_cal)]) * 1.10, 2)

        axbarx.text(0.6 * xmax, round(0.9 * max(num_values_per_bin_uncal)), 'Total counts = ' + str(sum(num_values_per_bin_uncal)), fontsize=10)

        ax.set_ylim(bottom=0, top=ymax)
        axbarx.set_ylim(bottom=0, top=round(max(num_values_per_bin_uncal) + 0.1 * max(num_values_per_bin_uncal)))
        ax.set_xlim(left=0, right=xmax)

        ax.text(0.02, 0.9*ymax, 'R$^2$ = %3.2f ' % r2_uncal, fontdict={'fontsize': 8, 'color': 'gray'})
        ax.text(0.02, 0.8*ymax, 'slope = %3.2f ' % slope_uncal, fontdict={'fontsize': 8, 'color': 'gray'})
        ax.text(0.02, 0.7*ymax, 'intercept = %3.2f ' % intercept_uncal, fontdict={'fontsize': 8, 'color': 'gray'})
        ax.text(0.02, 0.6*ymax, 'R$^2$ = %3.2f ' % r2_cal, fontdict={'fontsize': 8, 'color': 'blue'})
        ax.text(0.02, 0.5*ymax, 'slope = %3.2f ' % slope_cal, fontdict={'fontsize': 8, 'color': 'blue'})
        ax.text(0.02, 0.4*ymax, 'intercept = %3.2f ' % intercept_cal, fontdict={'fontsize': 8, 'color': 'blue'})

        # Plot y = x line as reference point
        maxx = max(xmax, ymax)
        ax.plot([0, maxx], [0, maxx], '--', lw=2, color='red', alpha=0.5)

        ax.legend(loc='lower right', fontsize=8)

        fig.savefig(os.path.join(savepath, str(model_type) + '_residuals_vs_modelerror_' + str(data_type) + '_uncal_cal_overlay.png'), dpi=image_dpi, bbox_inches='tight')

        if show_figure is True:
            plt.show()
        else:
            plt.close()
        return
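# --- Illustrative usage sketch (not part of the mastml.plots module) ---
# A minimal, hypothetical example of plotting the r-statistic distribution (residuals divided by
# predicted model errors) with Error.plot_rstat, using synthetic residuals and error estimates.
# The save path 'example_results' is an assumption; the keyword arguments follow the method
# definition above.
#
#   import numpy as np
#   from mastml.plots import Error
#
#   residuals = np.random.normal(0, 1.0, 200)
#   model_errors = np.abs(np.random.normal(1.0, 0.1, 200))
#   Error.plot_rstat(savepath='example_results', data_type='test', residuals=residuals,
#                    model_errors=model_errors, show_figure=False, is_calibrated=False)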
[docs]class Histogram(): """ Class to generate histogram plots, such as histograms of residual values Args: None Methods: plot_histogram: method to plot a basic histogram of supplied data Args: df: (pd.DataFrame), dataframe or series of data to plot as a histogram savepath: (str), string denoting the save path for the figure image file_name: (str), string denoting the character of the file name, e.g. train vs. test x_label: (str), string denoting the property name show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook) Returns: None plot_residuals_histogram: method to plot a histogram of residual values Args: y_true: (pd.Series), series of true y data y_pred: (pd.Series), series of predicted y data savepath: (str), string denoting the save path for the figure image file_name: (str), string denoting the character of the file name, e.g. train vs. test show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook) Returns: None _get_histogram_bins: Method to obtain the number of bins to use when plotting a histogram Args: df: (pandas Series or numpy array), array of y data used to construct histogram Returns: num_bins: (int), the number of bins to use when plotting a histogram """
[docs] @classmethod def plot_histogram(cls, df, savepath, file_name, x_label, show_figure=False, file_extension='.csv', image_dpi=250): # Make the dataframe 1D if it isn't df = check_dimensions(df) # make fig and ax, use x_align when placing text so things don't overlap x_align = 0.70 fig, ax = make_fig_ax(aspect_ratio=0.5, x_align=x_align) # Get num_bins using smarter method num_bins = cls._get_histogram_bins(df=df) # do the actual plotting ax.hist(df, bins=num_bins, color='b', edgecolor='k') # normal text stuff ax.set_xlabel(x_label, fontsize=14) ax.set_ylabel('Number of occurrences', fontsize=14) plot_stats(fig, dict(df.describe()), x_align=x_align, y_align=0.90, fontsize=12) # Save data to excel file and image if file_extension == '.xlsx': df.to_excel(os.path.join(savepath, file_name + '.xlsx')) df.describe().to_excel(os.path.join(savepath, file_name + '_statistics.xlsx')) elif file_extension == '.csv': df.to_csv(os.path.join(savepath, file_name + '.csv')) df.describe().to_csv(os.path.join(savepath, file_name + '_statistics.csv')) fig.savefig(os.path.join(savepath, file_name + '.png'), dpi=image_dpi, bbox_inches='tight') if show_figure == True: plt.show() else: plt.close() return
[docs] @classmethod def plot_residuals_histogram(cls, y_true, y_pred, savepath, show_figure=False, file_extension='.csv', image_dpi=250, file_name='residual_histogram'): y_true = check_dimensions(y_true) y_pred = check_dimensions(y_pred) residuals = y_pred-y_true cls.plot_histogram(df=residuals, savepath=savepath, file_name=file_name, x_label='Residuals', show_figure=show_figure, file_extension=file_extension, image_dpi=image_dpi) return
@classmethod
def _get_histogram_bins(cls, df):
    # Candidate bin counts come from dividing the number of data points by a range of dividers;
    # the largest candidate that still uses fewer bins than half the number of points is kept,
    # with a fallback of 10 bins if no candidate qualifies.
    bin_dividers = np.linspace(df.shape[0], 0.05*df.shape[0], df.shape[0])
    bin_list = list()
    try:
        for divider in bin_dividers:
            if divider == 0:
                continue
            bins = int((df.shape[0])/divider)
            if bins < df.shape[0]/2:
                bin_list.append(bins)
    except:
        pass
    if len(bin_list) > 0:
        num_bins = max(bin_list)
    else:
        num_bins = 10
    return num_bins
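# Usage sketch (illustrative only, not part of the original module); assumes synthetic data
# and a writable output directory './plots_example':
# >>> import numpy as np, pandas as pd
# >>> y_true = pd.Series(np.random.normal(0, 1, 200))
# >>> y_pred = y_true + pd.Series(np.random.normal(0, 0.2, 200))
# >>> Histogram.plot_residuals_histogram(y_true=y_true, y_pred=y_pred, savepath='./plots_example',
# ...                                    file_name='residual_histogram_example')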
[docs]class Line():
    '''
    Class containing methods for constructing line plots

    Args:
        None

    Methods:
        plot_learning_curve: Method used to plot both data and feature learning curves

            Args:
                train_sizes: (numpy array), array of x-axis values, such as fraction of data used or number of features
                train_mean: (numpy array), array of training data mean values, averaged over some type/number of CV splits
                test_mean: (numpy array), array of test data mean values, averaged over some type/number of CV splits
                train_stdev: (numpy array), array of training data standard deviation values, from some type/number of CV splits
                test_stdev: (numpy array), array of test data standard deviation values, from some type/number of CV splits
                score_name: (str), type of score metric for learning curve plotting; used in y-axis label
                learning_curve_type: (str), type of learning curve employed: 'data_learning_curve' or 'feature_learning_curve'
                savepath: (str), path to save the plotted learning curve to

            Returns:
                None
    '''
[docs] @classmethod def plot_learning_curve(cls, train_sizes, train_mean, test_mean, train_stdev, test_stdev, score_name, learning_curve_type, savepath, image_dpi=250): # Set image aspect ratio (do custom for learning curve): w, h = figaspect(0.75) fig = Figure(figsize=(w, h)) FigureCanvas(fig) gs = plt.GridSpec(1, 1) ax = fig.add_subplot(gs[0:, 0:]) max_x = max(train_sizes) min_x = min(train_sizes) max_y, min_y = recursive_max_and_min([ train_mean, train_mean + train_stdev, train_mean - train_stdev, test_mean, test_mean + test_stdev, test_mean - test_stdev, ]) max_x = round(float(max_x), rounder(max_x - min_x)) min_x = round(float(min_x), rounder(max_x - min_x)) max_y = round(float(max_y), rounder(max_y - min_y)) min_y = round(float(min_y), rounder(max_y - min_y)) _set_tick_labels_different(ax, max_x, min_x, max_y, min_y) # plot and collect handles h1 and h2 for making legend h1 = ax.plot(train_sizes, train_mean, '-o', color='blue', markersize=10, alpha=0.7)[0] ax.fill_between(train_sizes, train_mean - train_stdev, train_mean + train_stdev, alpha=0.1, color='blue') h2 = ax.plot(train_sizes, test_mean, '-o', color='red', markersize=10, alpha=0.7)[0] ax.fill_between(train_sizes, test_mean - test_stdev, test_mean + test_stdev, alpha=0.1, color='red') ax.legend([h1, h2], ['train score', 'validation score'], loc='center right', fontsize=12) if learning_curve_type == 'data_learning_curve': ax.set_xlabel('Number of training data points', fontsize=16) elif learning_curve_type == 'feature_learning_curve': ax.set_xlabel('Number of features selected', fontsize=16) else: raise ValueError( 'The param "learning_curve_type" must be either "data_learning_curve" or "feature_learning_curve"') ax.set_ylabel(score_name, fontsize=16) fig.savefig(os.path.join(savepath, learning_curve_type + '.png'), dpi=image_dpi, bbox_inches='tight') return
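# Usage sketch (illustrative only, not part of the original module); the arrays below are
# synthetic learning-curve statistics, and './plots_example' is assumed to be a writable directory:
# >>> import numpy as np
# >>> train_sizes = np.array([20, 40, 60, 80, 100])
# >>> train_mean = np.array([0.95, 0.93, 0.92, 0.91, 0.90])
# >>> test_mean = np.array([0.70, 0.78, 0.83, 0.86, 0.88])
# >>> train_stdev = np.full(5, 0.02)
# >>> test_stdev = np.full(5, 0.04)
# >>> Line.plot_learning_curve(train_sizes, train_mean, test_mean, train_stdev, test_stdev,
# ...                          score_name='r2_score', learning_curve_type='data_learning_curve',
# ...                          savepath='./plots_example')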
[docs]class Classification():
    '''
    Class to generate plots and reports for classification models

    Args:
        None

    Methods:
        plot_classification_report: Method used to build and plot the sklearn classification report

            Args:
                savepath: (str), path to save the plotted classification report to
                data_type: (str), string denoting the data type, e.g. train, test, leaveout
                y_true: (pd.Series), series of true y data
                y_pred: (pd.Series), series of predicted y data
                show_figure: (bool), whether or not to show the figure output (e.g. when using Jupyter notebook)

            Returns:
                None

        createClassificationReport: helper method that exports the classification report data to Excel and renders it as a heatmap image

        doProba: helper method that prints class probability predictions from model.predict_proba, if the model supports it
    '''
[docs] def createClassificationReport(savepath, report_dict, show_figure=False, data_type=''): # Parse report_dict data for export class_metrics = list(report_dict.items())[:-3] all_metric_names = list(class_metrics[0][1].keys()) class_names = [x[0] for x in class_metrics] all_class_values = [list(x[1].values()) for x in class_metrics] class_values_no_support = np.array([list(x[1].values())[:-1] for x in class_metrics]) metrics = list(report_dict.items())[-3:] meta_names = [m[0] for m in metrics] meta_values = np.array([([m[1]] if isinstance(m[1], float) else list(m[1].values())) for m in metrics], dtype=object) report_as_array = [] report_as_array.append([""]+all_metric_names) for i in range(len(class_names)): report_as_array.append([class_names[i]] + all_class_values[i]) for i in range(len(meta_names)): report_as_array.append([meta_names[i]] + meta_values[i]) # EXCEL pd.DataFrame(report_as_array).to_excel(os.path.join(savepath, 'classification_report_'+str(data_type)+'.xlsx'), index=False, header=None) # PLOT plot_metric_names = all_metric_names[:-1] dimensions = max(5, len(class_names)) fig, ax = plt.subplots(figsize=(dimensions, dimensions)) im = ax.imshow(class_values_no_support) ax.set_xticks(np.arange(len(plot_metric_names))) ax.set_yticks(np.arange(len(class_names))) ax.set_xticklabels(plot_metric_names) ax.set_yticklabels(class_names) plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") font = {'family': 'serif', 'weight': 'normal', 'size': 10, } for i in range(len(class_names)): for j in range(len(plot_metric_names)): text = ax.text(j, i, round(class_values_no_support[i, j], 3), ha="center", va="center", color="w", fontdict=font) ax.set_title("Classification Report") fig.tight_layout() divider = make_axes_locatable(ax) cax = divider.append_axes("right", size="15%", pad=0.05) plt.colorbar(im, cax=cax) fig.savefig(os.path.join(savepath, 'classification_report_'+str(data_type)+'.png'), bbox_inches='tight') plt.show() if show_figure else plt.close()
[docs] @classmethod def plot_classification_report(cls, savepath, data_type, y_true, y_pred, show_figure): report = classification_report(y_true=y_true, y_pred=y_pred, output_dict=True) Classification.createClassificationReport(savepath, report, show_figure, data_type)
[docs] @classmethod
def doProba(cls, model, X_test):
    # Print the class probability predictions from the model, if the model supports predict_proba
    print("="*4 + "doProba" + "="*4)
    print("model:")
    print(model)
    print(f"hasattr(model, 'predict_proba') {hasattr(model, 'predict_proba')}")
    if hasattr(model, 'predict_proba'):
        try:
            print("predict_proba")
            y_proba = model.predict_proba(X_test)
            print(y_proba)
        except NotFittedError as e:
            print(e)
    print("="*18)
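# Usage sketch (illustrative only, not part of the original module); assumes binary class labels
# and a writable output directory './plots_example' (the Excel export requires openpyxl):
# >>> import pandas as pd
# >>> y_true = pd.Series([0, 1, 1, 0, 1, 0, 1, 1])
# >>> y_pred = pd.Series([0, 1, 0, 0, 1, 1, 1, 1])
# >>> Classification.plot_classification_report(savepath='./plots_example', data_type='test',
# ...                                           y_true=y_true, y_pred=y_pred, show_figure=False)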
[docs]def plot_feature_occurrence(savepath, feature, occurrence): """ Function to plot the occurrence of each feature in all of the splits Args: savepath: (str), string denoting the path to save output to feature: (list) the list of features occurrence: (list) the list of occurrence of each feature Returns: None. """ plt.tick_params(axis='y', labelsize=10) plt.barh(feature, occurrence) plt.xlabel("Occurrence") plt.ylabel("Feature") plt.title("Occurrence of Features") plt.savefig(os.path.join(savepath, 'Feature_Occurrence.png')) plt.clf() return
[docs]def plot_avg_score_vs_occurrence(savepath, occurrence, score, std_score):
    """
    Function to plot the average score of each feature against their occurrence in all of the splits

    Args:
        savepath: (str), string denoting the path to save output to

        occurrence: (list or numpy array), the list of occurrence of each feature

        score: (list or numpy array), the list of the feature ranking score of each feature

        std_score: (list or numpy array), the list of standard deviation of the feature ranking score

    Returns:
        None.
    """
    # Convert to arrays so the elementwise score +/- std_score shading also works for plain lists
    occurrence = np.asarray(occurrence)
    score = np.asarray(score)
    std_score = np.asarray(std_score)
    plt.plot(occurrence, score, marker='o')
    plt.fill_between(occurrence, score - std_score, score + std_score, alpha=0.1, color='blue')
    plt.title('Avg Score vs Occurrence')
    plt.xlabel('Occurrence')
    plt.ylabel('Avg Score')
    plt.savefig(os.path.join(savepath, 'AvgScoreVsOccurrence.png'))
    plt.clf()
    return
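# Usage sketch (illustrative only, not part of the original module); the feature names, counts
# and scores below are hypothetical, and './plots_example' is assumed to be a writable directory:
# >>> import numpy as np
# >>> features = ['feature_A', 'feature_B', 'feature_C']
# >>> occurrence = np.array([5, 3, 1])
# >>> plot_feature_occurrence('./plots_example', features, occurrence)
# >>> avg_score = np.array([0.2, 0.5, 0.9])
# >>> std_score = np.array([0.05, 0.1, 0.2])
# >>> plot_avg_score_vs_occurrence('./plots_example', occurrence, avg_score, std_score)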
[docs]def make_plots(plots, y_true, y_pred, groups, dataset_stdev, metrics, model, residuals, model_errors, has_model_errors, savepath, data_type, X_test=None, show_figure=False, recalibrate_errors=False, model_errors_cal=None, splits_summary=False, file_extension='.csv', image_dpi=250): """ Helper function to make collections of different types of plots after a single or multiple data splits are evaluated. Args: plots: (list of str), list denoting which types of plots to make. Viable entries are "Scatter", "Histogram", "Error" y_true: (pd.Series), series containing the true y data y_pred: (pd.Series), series containing the predicted y data groups: (list), list denoting the group label for each data point dataset_stdev: (float), the dataset standard deviation metrics: (list of str), list denoting the metric names to evaluate. See mastml.metrics.Metrics.metrics_zoo_ for full list model: (mastml.models object), a MAST-ML model object, e.g. SklearnModel or EnsembleModel residuals: (pd.Series), series containing the residuals (true model errors) model_errors: (pd.Series), series containing the as-obtained uncalibrated model errors has_model_errors: (bool), whether the model type used can be subject to UQ and thus have model errors calculated savepath: (str), string denoting the path to save output to data_type: (str), string denoting the data type analyzed, e.g. train, test, leaveout X_test: (pd.DataFrame), dataframe of X feature test data, used to report class probabilities for classification models (default None) show_figure: (bool), whether or not the generated figure is output to the notebook screen (default False) recalibrate_errors: (bool), whether or not the model errors have been recalibrated (default False) model_errors_cal: (pd.Series), series containing the calibrated predicted model errors splits_summary: (bool), whether or not the data used in the plots comes from a collection of many splits (default False), False denotes a single split folder file_extension: (str), file extension for saved supplementary data files, either '.csv' or '.xlsx' (default '.csv') image_dpi: (int), dots-per-inch resolution of the saved figure images (default 250) Returns: None. """ if 'Histogram' in plots: try: Histogram.plot_residuals_histogram(y_true=y_true, y_pred=y_pred, savepath=savepath, file_name='residual_histogram_'+str(data_type), show_figure=show_figure, file_extension=file_extension, image_dpi=image_dpi) except: print('Warning: unable to make Histogram.plot_residuals_histogram. Skipping...') if 'Scatter' in plots: try: Scatter.plot_predicted_vs_true(y_true=y_true, y_pred=y_pred, savepath=savepath, x_label='values', data_type=data_type, metrics_list=metrics, show_figure=show_figure, ebars=None, file_extension=file_extension, image_dpi=image_dpi, groups=None) if groups is not None: Scatter.plot_predicted_vs_true(y_true=y_true, y_pred=y_pred, savepath=savepath, x_label='values', data_type=data_type, metrics_list=metrics, show_figure=show_figure, ebars=None, file_extension=file_extension, image_dpi=image_dpi, groups=groups) if recalibrate_errors == True: Scatter.plot_predicted_vs_true(y_true=y_true, y_pred=y_pred, savepath=savepath, x_label='values', data_type=data_type, metrics_list=metrics, show_figure=show_figure, ebars=model_errors_cal, file_extension=file_extension, image_dpi=image_dpi, groups=None) if groups is not None: Scatter.plot_predicted_vs_true(y_true=y_true, y_pred=y_pred, savepath=savepath, x_label='values', data_type=data_type, metrics_list=metrics, show_figure=show_figure, ebars=model_errors_cal, file_extension=file_extension, image_dpi=image_dpi, groups=groups) except: print('Warning: unable to make Scatter.plot_predicted_vs_true plot. Skipping...')
if splits_summary is True: if data_type != 'leaveout': try: Scatter.plot_best_worst_split(savepath=savepath, data_type=data_type, x_label='values', metrics_list=metrics, show_figure=show_figure, file_extension=file_extension, image_dpi=image_dpi) except: print('Warning: unable to make Scatter.plot_best_worst_split plot. Skipping...') try: Scatter.plot_best_worst_per_point(savepath=savepath, data_type=data_type, x_label='values', metrics_list=metrics, show_figure=show_figure, file_extension=file_extension, image_dpi=image_dpi) except: print('Warning: unable to make Scatter.plot_best_worst_per_point plot. Skipping...') try: Scatter.plot_predicted_vs_true_bars(savepath=savepath, data_type=data_type, x_label='values', metrics_list=metrics, show_figure=show_figure, file_extension=file_extension, image_dpi=image_dpi, groups=None) if groups is not None: Scatter.plot_predicted_vs_true_bars(savepath=savepath, data_type=data_type, x_label='values', metrics_list=metrics, show_figure=show_figure, file_extension=file_extension, image_dpi=image_dpi, groups=groups) if recalibrate_errors == True: Scatter.plot_predicted_vs_true_bars(savepath=savepath, data_type=data_type, x_label='values', metrics_list=metrics, show_figure=show_figure, ebars=model_errors_cal, file_extension=file_extension, image_dpi=image_dpi, groups=None) if groups is not None: Scatter.plot_predicted_vs_true_bars(savepath=savepath, data_type=data_type, x_label='values', metrics_list=metrics, show_figure=show_figure, ebars=model_errors_cal, file_extension=file_extension, image_dpi=image_dpi, groups=groups) except: print('Warning: unable to make Scatter.plot_predicted_vs_true_bars plot. Skipping...') if 'Error' in plots: try: Error.plot_qq(residuals=residuals, savepath=savepath, data_type=data_type, show_figure=show_figure, image_dpi=image_dpi) except: print('Warning: unable to make Error.plot_qq plot. Skipping...') try: Error.plot_normalized_error(residuals=residuals, savepath=savepath, data_type=data_type, model_errors=model_errors, show_figure=show_figure, file_extension=file_extension, image_dpi=image_dpi) except: print('Warning: unable to make Error.plot_normalized_error plot. Skipping...') try: Error.plot_cumulative_normalized_error(residuals=residuals, savepath=savepath, data_type=data_type, model_errors=model_errors, show_figure=show_figure, file_extension=file_extension, image_dpi=image_dpi) except: print('Warning: unable to make Error.plot_cumulative_normalized_error plot. Skipping...') if has_model_errors is True: try: Error.plot_rstat(savepath=savepath, data_type=data_type, model_errors=model_errors, residuals=residuals, show_figure=show_figure, is_calibrated=False, image_dpi=image_dpi) except: print('Warning: unable to make Error.plot_rstat plot. Skipping...') try: Error.plot_real_vs_predicted_error(savepath=savepath, model=model, data_type=data_type, model_errors=model_errors, residuals=residuals, dataset_stdev=dataset_stdev, show_figure=show_figure, is_calibrated=False, image_dpi=image_dpi) except: print('Warning: unable to make Error.plot_real_vs_predicted_error plot. Skipping...') if recalibrate_errors is True: try: Error.plot_rstat(savepath=savepath, data_type=data_type, residuals=residuals, model_errors=model_errors_cal, show_figure=show_figure, is_calibrated=True, image_dpi=image_dpi) except: print('Warning: unable to make Error.plot_rstat plot. Skipping...')
try: Error.plot_rstat_uncal_cal_overlay(savepath=savepath, data_type=data_type, residuals=residuals, model_errors=model_errors, model_errors_cal=model_errors_cal, show_figure=False, image_dpi=image_dpi) except: print('Warning: unable to make Error.plot_rstat_uncal_cal_overlay plot. Skipping...') try: Error.plot_real_vs_predicted_error(savepath=savepath, model=model, data_type=data_type, residuals=residuals, model_errors=model_errors_cal, dataset_stdev=dataset_stdev, show_figure=show_figure, is_calibrated=True, image_dpi=image_dpi) except: print('Warning: unable to make Error.plot_real_vs_predicted_error plot. Skipping...') try: Error.plot_real_vs_predicted_error_uncal_cal_overlay(savepath=savepath, model=model, data_type=data_type, model_errors=model_errors, model_errors_cal=model_errors_cal, residuals=residuals, dataset_stdev=dataset_stdev, show_figure=False, image_dpi=image_dpi) except: print('Warning: unable to make Error.plot_real_vs_predicted_error_uncal_cal_overlay plot. Skipping...') if 'Classification' in plots: Classification.plot_classification_report(savepath=savepath, data_type=data_type, y_true=y_true, y_pred=y_pred, show_figure=show_figure) Classification.doProba(model=model, X_test=X_test) return
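# Usage sketch (illustrative only, not part of the original module); assumes synthetic regression
# data and a writable output directory './plots_example'. Only the Scatter and Histogram plot
# families are requested here, so no model or model error series are needed:
# >>> import numpy as np, pandas as pd
# >>> y_true = pd.Series(np.random.normal(0, 1, 100))
# >>> y_pred = y_true + pd.Series(np.random.normal(0, 0.2, 100))
# >>> make_plots(plots=['Scatter', 'Histogram'], y_true=y_true, y_pred=y_pred, groups=None,
# ...            dataset_stdev=float(np.std(y_true)), metrics=['mean_absolute_error', 'r2_score'],
# ...            model=None, residuals=y_pred-y_true, model_errors=None, has_model_errors=False,
# ...            savepath='./plots_example', data_type='test')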
[docs]def check_dimensions(y): """ Method to check the dimensions of supplied data. Plotters need data to be 1D and often data is passed in as 2D Args: y: (numpy array or pd.DataFrame), array or dataframe of data used for plotting Returns: y: (pd.Series), series that is now 1D """ if len(y.shape) > 1: if type(y) == pd.core.frame.DataFrame: y = pd.DataFrame.squeeze(y) elif type(y) == np.ndarray: y = pd.DataFrame(y.ravel()).squeeze() #y = y.ravel() else: if type(y) == np.ndarray: y = pd.DataFrame(y).squeeze() return y
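# Usage sketch (illustrative only): a single-column 2D array is squeezed down to a 1D pandas
# object so downstream plotters can treat it as a series of values:
# >>> import numpy as np
# >>> y = np.array([[1.0], [2.0], [3.0]])
# >>> check_dimensions(y).shape
# (3,)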
[docs]def reset_index(y):
    """
    Method to reset the index of supplied data by rebuilding it as a new dataframe with a default integer index

    Args:
        y: (pd.Series, pd.DataFrame or numpy array), array of data

    Returns:
        (pd.DataFrame), dataframe of the supplied data with a fresh 0-based index
    """
    return pd.DataFrame(np.array(y))
[docs]def trim_array(arr_list):
    """
    Method used to trim a set of arrays to make all arrays the same shape

    Args:
        arr_list: (list), list of numpy arrays, where arrays are different sizes

    Returns:
        arr_list: (list), list of trimmed numpy arrays, where arrays are the same size
    """
    # TODO: a better way to handle arrays with very different shapes? Otherwise average only uses # of points of smallest array
    # Need to make arrays all the same shape if they aren't
    sizes = [arr.shape[0] for arr in arr_list]
    size_min = min(sizes)
    arr_list_ = list()
    for i, arr in enumerate(arr_list):
        if arr.shape[0] > size_min:
            # Drop trailing entries until this array matches the shortest array
            while arr.shape[0] > size_min:
                arr = np.delete(arr, -1)
        arr_list_.append(arr)
    arr_list = arr_list_
    return arr_list
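# Usage sketch (illustrative only): arrays longer than the shortest member are truncated from
# the end so that all arrays share the same length:
# >>> import numpy as np
# >>> trimmed = trim_array([np.array([1, 2, 3, 4]), np.array([5, 6])])
# >>> [arr.tolist() for arr in trimmed]
# [[1, 2], [5, 6]]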
[docs]def rounder(delta):
    """
    Method to obtain number of decimal places to report on plots

    Args:
        delta: (float), a float representing the change in two y values on a plot, used to obtain the plot axis spacing size

    Return:
        (int), an integer denoting the number of decimal places to use
    """
    if 0.001 <= delta < 0.01:
        return 3
    elif 0.01 <= delta < 0.1:
        return 2
    elif 0.1 <= delta < 1:
        return 1
    else:
        return 0
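# Usage sketch (illustrative only): the number of reported decimal places scales with the size
# of the axis range:
# >>> rounder(0.05)
# 2
# >>> rounder(0.5)
# 1
# >>> rounder(50)
# 0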
[docs]def stat_to_string(name, value, nice_names):
    """
    Method that converts a metric name/value pair into a string for displaying on a plot

    Args:
        name: (str), long name of a stat metric or quantity

        value: (float or tuple), value of the metric or quantity, or a (mean, stdev) tuple

        nice_names: (dict), dict mapping metric names to shorter display names, e.g. as returned by nice_names()

    Return:
        (str), a string of the metric name and value, adjusted to look nicer for inclusion on a plot
    """
    if name in nice_names:
        name = nice_names[name]
    else:
        name = name.replace('_', ' ')
    # has a name only
    if not value:
        return name
    # has a mean and std
    if isinstance(value, tuple):
        mean, std = value
        return f'{name}:' + '\n\t' + f'{mean:.3f}' + r'$\pm$' + f'{std:.3f}'
    # has a name and value only
    if isinstance(value, int) or (isinstance(value, float) and value % 1 == 0):
        return f'{name}: {int(value)}'
    if isinstance(value, float):
        return f'{name}: {value:.3f}'
    return f'{name}: {value}'  # probably a string
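# Usage sketch (illustrative only): metric names are mapped to their short display names and
# values are formatted to three decimal places:
# >>> stat_to_string('mean_absolute_error', 0.1234, nice_names=nice_names())
# 'MAE: 0.123'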
[docs]def plot_stats(fig, stats, x_align=0.65, y_align=0.90, font_dict=dict(), fontsize=14): """ Method that prints stats onto the plot. Goes off screen if they are too long or too many in number. Args: fig: (matplotlib figure object), a matplotlib figure object stats: (dict), dict of statistics to be included with a plot x_align: (float), float denoting x position of where to align display of stats on a plot y_align: (float), float denoting y position of where to align display of stats on a plot font_dict: (dict), dict of matplotlib font options to alter display of stats on plot fontsize: (int), the fontsize of stats to display on plot Returns: None """ stat_str = '\n'.join(stat_to_string(name, value, nice_names=nice_names()) for name, value in stats.items()) fig.text(x_align, y_align, stat_str, verticalalignment='top', wrap=True, fontdict=font_dict, fontproperties=FontProperties(size=fontsize))
[docs]def make_fig_ax(aspect_ratio=0.5, x_align=0.65, left=0.10): """ Method to make matplotlib figure and axes objects. Using Object Oriented interface from https://matplotlib.org/gallery/api/agg_oo_sgskip.html Args: aspect_ratio: (float), aspect ratio for figure and axes creation x_align: (float), x position to draw edge of figure. Needed so can display stats alongside plot left: (float), the leftmost position to draw edge of figure Returns: fig: (matplotlib fig object), a matplotlib figure object with the specified aspect ratio ax: (matplotlib ax object), a matplotlib axes object with the specified aspect ratio """ # Set image aspect ratio: w, h = figaspect(aspect_ratio) fig = plt.figure(figsize=(w, h)) #fig = Figure(figsize=(w, h)) FigureCanvas(fig) # Set custom positioning, see this guide for more details: # https://python4astronomers.github.io/plotting/advanced.html #left = 0.10 bottom = 0.15 right = 0.01 top = 0.05 width = x_align - left - right height = 1 - bottom - top ax = fig.add_axes((left, bottom, width, height), frameon=True) fig.set_tight_layout(False) return fig, ax
[docs]def make_fig_ax_square(aspect='equal', aspect_ratio=1): """ Method to make square shaped matplotlib figure and axes objects. Using Object Oriented interface from https://matplotlib.org/gallery/api/agg_oo_sgskip.html Args: aspect: (str), 'equal' denotes x and y aspect will be equal (i.e. square) aspect_ratio: (float), aspect ratio for figure and axes creation Returns: fig: (matplotlib fig object), a matplotlib figure object with the specified aspect ratio ax: (matplotlib ax object), a matplotlib axes object with the specified aspect ratio """ # Set image aspect ratio: w, h = figaspect(aspect_ratio) fig = Figure(figsize=(w, h)) FigureCanvas(fig) ax = fig.add_subplot(111, aspect=aspect) return fig, ax
[docs]def make_axis_same(ax, max1, min1): """ Method to make the x and y ticks for each axis the same. Useful for parity plots Args: ax: (matplotlib axis object), a matplotlib axes object max1: (float), the maximum value of a particular axis min1: (float), the minimum value of a particular axis Returns: None """ if max1 - min1 > 5: step = (int(max1) - int(min1)) // 3 ticks = range(int(min1), int(max1)+step, step) else: ticks = np.linspace(min1, max1, 5) ax.set_xticks(ticks) ax.set_yticks(ticks)
[docs]def nice_mean(ls): """ Method to return mean of a list or equivalent array with NaN values Args: ls: (list), list of values Returns: (numpy array), array containing mean of list of values or NaN if list has no values """ if len(ls) > 0: return np.mean(ls) return np.nan
[docs]def nice_std(ls): """ Method to return standard deviation of a list or equivalent array with NaN values Args: ls: (list), list of values Returns: (numpy array), array containing standard deviation of list of values or NaN if list has no values """ if len(ls) > 0: return np.std(ls) return np.nan
[docs]def round_down(num, divisor): """ Method to return a rounded down number Args: num: (float), a number to round down divisor: (int), divisor to denote how to round down Returns: (float), the rounded-down number """ return num - (num % divisor)
[docs]def round_up(num, divisor): """ Method to return a rounded up number Args: num: (float), a number to round up divisor: (int), divisor to denote how to round up Returns: (float), the rounded-up number """ return float(math.ceil(num / divisor)) * divisor
[docs]def get_divisor(high, low): """ Method to obtain a sensible divisor based on range of two values Args: high: (float), a max data value low: (float), a min data value Returns: divisor: (float), a number used to make sensible axis ticks """ delta = high-low divisor = 10 if delta > 1000: divisor = 100 if delta < 1000: if delta > 100: divisor = 10 if delta < 100: if delta > 10: divisor = 1 if delta < 10: if delta > 1: divisor = 0.1 if delta < 1: if delta > 0.01: divisor = 0.001 else: divisor = 0.001 return divisor
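# Usage sketch (illustrative only): a divisor chosen from the data range can be combined with
# round_down/round_up to get tidy axis bounds:
# >>> get_divisor(high=87.0, low=12.0)
# 1
# >>> round_down(12.0, 1), round_up(87.0, 1)
# (12.0, 87.0)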
[docs]def recursive_max(arr): """ Method to recursively find the max value of an array of iterables. Credit: https://www.linkedin.com/pulse/ask-recursion-during-coding-interviews-identify-good-talent-veteanu/ Args: arr: (numpy array), an array of values or iterables Returns: (float), max value in arr """ return max( recursive_max(e) if isinstance(e, Iterable) else e for e in arr )
[docs]def recursive_min(arr): """ Method to recursively find the min value of an array of iterables. Credit: https://www.linkedin.com/pulse/ask-recursion-during-coding-interviews-identify-good-talent-veteanu/ Args: arr: (numpy array), an array of values or iterables Returns: (float), min value in arr """ return min( recursive_min(e) if isinstance(e, Iterable) else e for e in arr )
[docs]def recursive_max_and_min(arr): """ Method to recursively return max and min of values or iterables in array Args: arr: (numpy array), an array of values or iterables Returns: (tuple), tuple containing max and min of arr """ return recursive_max(arr), recursive_min(arr)
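# Usage sketch (illustrative only): the max and min are located across arbitrarily nested
# iterables of values:
# >>> recursive_max_and_min([[1.0, 5.0], [2.0, -3.0], 4.0])
# (5.0, -3.0)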
def _set_tick_labels(ax, maxx, minn): """ Method that sets the x and y ticks to be in the same range Args: ax: (matplotlib axes object), a matplotlib axes object maxx: (float), a maximum value minn: (float), a minimum value Returns: None """ _set_tick_labels_different(ax, maxx, minn, maxx, minn) # I love it when this happens def _set_tick_labels_different(ax, max_tick_x, min_tick_x, max_tick_y, min_tick_y): """ Method that sets the x and y ticks, when the axes have different ranges Args: ax: (matplotlib axes object), a matplotlib axes object max_tick_x: (float), the maximum tick value for the x axis min_tick_x: (float), the minimum tick value for the x axis max_tick_y: (float), the maximum tick value for the y axis min_tick_y: (float), the minimum tick value for the y axis Returns: None """ tickvals_x = nice_range(min_tick_x, max_tick_x) tickvals_y = nice_range(min_tick_y, max_tick_y) if tickvals_x[-1]-tickvals_x[len(tickvals_x)-2] < tickvals_x[len(tickvals_x)-3]-tickvals_x[len(tickvals_x)-4]: tickvals_x = tickvals_x[:-1] if tickvals_y[-1]-tickvals_y[len(tickvals_y)-2] < tickvals_y[len(tickvals_y)-3]-tickvals_y[len(tickvals_y)-4]: tickvals_y = tickvals_y[:-1] #tickvals_x = _clean_tick_labels(tickvals=tickvals_x, delta=max_tick_x-min_tick_x) #tickvals_y = _clean_tick_labels(tickvals=tickvals_y, delta=max_tick_y - min_tick_y) ax.set_xticks(ticks=tickvals_x) ax.set_yticks(ticks=tickvals_y) ticklabels_x = [str(tick) for tick in tickvals_x] ticklabels_y = [str(tick) for tick in tickvals_y] rotation = 0 # Look at length of x tick labels to see if may be possibly crowded. If so, rotate labels tick_length = len(str(tickvals_x[1])) if tick_length >= 4: rotation = 45 ax.set_xticklabels(labels=ticklabels_x, fontsize=14, rotation=rotation) ax.set_yticklabels(labels=ticklabels_y, fontsize=14) def _clean_tick_labels(tickvals, delta): """ Method to attempt to clean up axis tick values so they don't overlap from being too dense Args: tickvals: (list), a list containing the initial axis tick values delta: (float), number representing the numerical difference of two ticks Returns: tickvals_clean: (list), a list containing the updated axis tick values """ tickvals_clean = list() if delta >= 100: for i, val in enumerate(tickvals): if i <= len(tickvals)-1: if tickvals[i]-tickvals[i-1] >= 100: tickvals_clean.append(val) else: tickvals_clean = tickvals return tickvals_clean # Math utilities to aid plot_helper to make ranges
[docs]def nice_range(lower, upper): """ Method to create a range of values, including the specified start and end points, with nicely spaced intervals Args: lower: (float or int), lower bound of range to create upper: (float or int), upper bound of range to create Returns: (list), list of numerical values in established range """ flipped = 1 # set to -1 for inverted # Case for validation where nan is passed in if np.isnan(lower): lower = 0 if np.isnan(upper): upper = 0.1 if upper < lower: upper, lower = lower, upper flipped = -1 return [_int_if_int(x) for x in _nice_range_helper(lower, upper)][::flipped]
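# Usage sketch (illustrative only): both endpoints are included and the interior ticks fall on
# evenly spaced, round-valued steps:
# >>> nice_range(0, 100)
# [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]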
def _nice_range_helper(lower, upper): """ Method to help make a better range of axis ticks Args: lower: (float), lower value of axis ticks upper: (float), upper value of axis ticks Returns: upper: (float), modified upper tick value fixed based on set of axis ticks """ steps = 8 diff = abs(lower - upper) # special case where lower and upper are the same if diff == 0: return [lower, ] # the exact step needed step = diff / steps # a rough estimate of best step step = _nearest_pow_ten(step) # whole decimal increments # tune in one the best step size factors = [0.1, 0.2, 0.5, 1, 2, 5, 10] # use this to minimize how far we are from ideal step size def best_one(steps_factor): steps_count, factor = steps_factor return abs(steps_count - steps) n_steps, best_factor = min([(diff / (step * f), f) for f in factors], key=best_one) #print('should see n steps', ceil(n_steps + 2)) # multiply in the optimal factor for getting as close to ten steps as we can step = step * best_factor # make the bounds look nice lower = _three_sigfigs(lower) upper = _three_sigfigs(upper) start = _round_up(lower, step) # prepare for iteration x = start # pointless init i = 0 # itereate until we reach upper while x < upper - step: x = start + i * step yield _three_sigfigs(x) # using sigfigs because of floating point error i += 1 # finish off with ending bound yield upper def _three_sigfigs(x): """ Method invoking special case of _n_sigfigs to return 3 sig figs Args: x: (float), an axis tick number Returns: (float), number of sig figs (always 3) """ return _n_sigfigs(x, 3) def _n_sigfigs(x, n): """ Method to return number of sig figs to use for axis ticks Args: x: (float), an axis tick number Returns: (float), number of sig figs """ sign = 1 if x == 0: return 0 if x < 0: # case for negatives x = -x sign = -1 if x < 1: base = n - round(log(x, 10)) else: base = (n-1) - round(log(x, 10)) return sign * round(x, base) def _nearest_pow_ten(x): """ Method to return the nearest power of ten for an axis tick value Args: x: (float), an axis tick number Returns: (float), nearest power of ten of x """ sign = 1 if x == 0: return 0 if x < 0: # case for negatives x = -x sign = -1 return sign*10**ceil(log(x, 10)) def _int_if_int(x): """ Method to return integer mapped value of x Args: x: (float or int), a number Returns: x: (float), value of x mapped as integer """ if int(x) == x: return int(x) return x def _round_up(x, inc): """ Method to round up the value of x Args: x: (float or int), a number inc: (float), an increment for axis ticks Returns: (float), value of x rounded up """ sign = 1 if x < 0: # case for negative x = -x sign = -1 return sign * inc * ceil(x / inc)
[docs]def nice_names(): nice_names = { # classification: 'accuracy': 'Accuracy', 'f1_binary': '$F_1$', 'f1_macro': 'f1_macro', 'f1_micro': 'f1_micro', 'f1_samples': 'f1_samples', 'f1_weighted': 'f1_weighted', 'log_loss': 'log_loss', 'precision_binary': 'Precision', 'precision_macro': 'prec_macro', 'precision_micro': 'prec_micro', 'precision_samples': 'prec_samples', 'precision_weighted': 'prec_weighted', 'recall_binary': 'Recall', 'recall_macro': 'rcl_macro', 'recall_micro': 'rcl_micro', 'recall_samples': 'rcl_samples', 'recall_weighted': 'rcl_weighted', 'roc_auc': 'ROC_AUC', # regression: 'explained_variance': 'expl_var', 'mean_absolute_error': 'MAE', 'mean_squared_error': 'MSE', 'mean_squared_log_error': 'MSLE', 'median_absolute_error': 'MedAE', 'root_mean_squared_error': 'RMSE', 'rmse_over_stdev': r'RMSE/$\sigma_y$', 'r2_score': '$R^2$', 'r2_score_noint': '$R^2_{noint}$', 'r2_score_adjusted': '$R^2_{adjusted}$', 'r2_score_fitted': '$R^2_{fitted}$' } return nice_names