Source code for mastml.baseline_tests

"""
This module contains baseline tests for models

Baseline_tests:
    Class that contains the tests for the models

"""
import os

import numpy as np
import pandas as pd
import scipy as sp
from scipy.spatial.distance import cdist
import mastml
from mastml.metrics import Metrics

try:
    data_path = os.path.join(mastml.__path__[0], 'data')
except:
    data_path = os.path.join(mastml.__path__._path[0], 'data')


class Baseline_tests():
    '''
    Methods:
        test_mean: Compares the score of the model with the score obtained using a constant test
            value (the mean of the y test data)

            Args:
                X_train: (dataframe), dataframe of X training features
                X_test: (dataframe), dataframe of X test features
                y_train: (series), series of y training data
                y_test: (series), series of y test data
                model: a fitted model used to predict y values from X_test
                metrics: (list), list of metric names to evaluate true vs. pred data in each split

            Returns:
                A dataframe of the results of the model for the selected metrics

        test_permuted: Compares the score of the model with the score obtained against permuted
            (shuffled) y test values

            Args:
                X_train: (dataframe), dataframe of X training features
                X_test: (dataframe), dataframe of X test features
                y_train: (series), series of y training data
                y_test: (series), series of y test data
                model: a fitted model used to predict y values from X_test
                metrics: (list), list of metric names to evaluate true vs. pred data in each split

            Returns:
                A dataframe of the results of the model for the selected metrics

        test_nearest_neighbour_kdtree: Compares the score of the model with the score obtained using
            the y value of the nearest training neighbour, found with a k-d tree

            Args:
                X_train: (dataframe), dataframe of X training features
                X_test: (dataframe), dataframe of X test features
                y_train: (series), series of y training data
                y_test: (series), series of y test data
                model: a fitted model used to predict y values from X_test
                metrics: (list), list of metric names to evaluate true vs. pred data in each split

            Returns:
                A dataframe of the results of the model for the selected metrics

        test_nearest_neighbour_cdist: Compares the score of the model with the score obtained using
            the y value of the nearest training neighbour, found with cdist

            Args:
                X_train: (dataframe), dataframe of X training features
                X_test: (dataframe), dataframe of X test features
                y_train: (series), series of y training data
                y_test: (series), series of y test data
                model: a fitted model used to predict y values from X_test
                metrics: (list), list of metric names to evaluate true vs. pred data in each split
                d_metric: (str), metric used to calculate the distance. Default is euclidean

            Returns:
                A dataframe of the results of the model for the selected metrics

        test_classifier_random: Compares the score of the model with the score obtained using a
            randomly chosen class as the test value

            Args:
                X_train: (dataframe), dataframe of X training features
                X_test: (dataframe), dataframe of X test features
                y_train: (series), series of y training data
                y_test: (series), series of y test data
                model: a fitted model used to predict y values from X_test
                metrics: (list), list of metric names to evaluate true vs. pred data in each split

            Returns:
                A dataframe of the results of the model for the selected metrics

        test_classifier_dominant: Compares the score of the model with the score obtained using the
            dominant class (i.e. the class with the highest count) as the test value

            Args:
                X_train: (dataframe), dataframe of X training features
                X_test: (dataframe), dataframe of X test features
                y_train: (series), series of y training data
                y_test: (series), series of y test data
                model: a fitted model used to predict y values from X_test
                metrics: (list), list of metric names to evaluate true vs. pred data in each split

            Returns:
                A dataframe of the results of the model for the selected metrics

        to_excel: Collects the comparison between the real score and the naive score into a dataframe

            Args:
                real_score: The actual score of the model
                naive_score: The naive score of the model, evaluated with the fake test values

            Returns:
                A dataframe with one (metric, real score, naive score) row per metric
    '''
    def test_mean(self, X_train, X_test, y_train, y_test, model, metrics=["mean_absolute_error"]):
        # Use the mean of the y test data as a constant naive prediction, i.e. pretend the
        # predicted value is a constant equal to the mean
        constant = y_test.mean()
        arr = [constant for i in range(len(y_test))]
        fake_test = pd.Series(arr)
        y_predict = model.predict(X_test)
        # Handle the edge case of predicting a single value
        if isinstance(y_predict, float):
            y_predict = np.array([y_predict])
        real_score = Metrics(metrics_list=metrics).evaluate(y_true=y_test, y_pred=y_predict)
        naive_score = Metrics(metrics_list=metrics).evaluate(y_true=y_test, y_pred=fake_test)
        return self.to_excel(real_score, naive_score)
    def test_permuted(self, X_train, X_test, y_train, y_test, model, metrics=["mean_absolute_error"]):
        # Shuffle the y test values so that the X features no longer correspond to the correct y data
        fake_test = y_test.sample(frac=1)
        y_predict = model.predict(X_test)
        # Handle the edge case of predicting a single value
        if isinstance(y_predict, float):
            y_predict = np.array([y_predict])
        real_score = Metrics(metrics_list=metrics).evaluate(y_true=y_test, y_pred=y_predict)
        naive_score = Metrics(metrics_list=metrics).evaluate(y_true=fake_test, y_pred=y_predict)
        return self.to_excel(real_score, naive_score)
    def test_nearest_neighbour_kdtree(self, X_train, X_test, y_train, y_test, model, metrics=["mean_absolute_error"]):
        # Use the y value of each test point's nearest training neighbour (found with a k-d tree)
        # instead of the actual y_test
        Xdatas = sp.spatial.cKDTree(X_train, leafsize=100)
        XresultDistance, XresultCoordinate = Xdatas.query(X_test)
        fake_test = []
        for i in XresultCoordinate:
            fake_test.append(y_train[i])
        fake_test = pd.DataFrame(fake_test)
        y_predict = model.predict(X_test)
        # Handle the edge case of predicting a single value
        if isinstance(y_predict, float):
            y_predict = np.array([y_predict])
        real_score = Metrics(metrics_list=metrics).evaluate(y_true=y_test, y_pred=y_predict)
        naive_score = Metrics(metrics_list=metrics).evaluate(y_true=y_test, y_pred=fake_test)
        return self.to_excel(real_score, naive_score)
    def test_nearest_neighbour_cdist(self, X_train, X_test, y_train, y_test, model, metrics=["mean_absolute_error"], d_metric="euclidean"):
        # Pairwise distances between each test point and every training point
        result = cdist(X_test, X_train, metric=d_metric)
        fake_test = []
        for i in range(len(X_test)):
            # Get the index of the nearest neighbour (i.e. shortest distance) for this test point
            nn_index = result[i].tolist().index(result[i].min())
            # Use the nearest training neighbour's y value instead of the actual y_test
            fake_test.append(y_train[nn_index])
        y_predict = model.predict(X_test)
        # Handle the edge case of predicting a single value
        if isinstance(y_predict, float):
            y_predict = np.array([y_predict])
        real_score = Metrics(metrics_list=metrics).evaluate(y_true=y_test, y_pred=y_predict)
        naive_score = Metrics(metrics_list=metrics).evaluate(y_true=fake_test, y_pred=y_predict)
        return self.to_excel(real_score, naive_score)
    def test_classifier_random(self, X_train, X_test, y_train, y_test, model, metrics=["mean_absolute_error"]):
        # Get the number of classes in the data and randomly pick one as the constant naive prediction
        n_classes = np.unique(y_train).size
        constant = np.random.randint(0, n_classes)
        arr = [constant for i in range(len(y_test))]
        fake_test = pd.Series(arr)
        y_predict = model.predict(X_test)
        real_score = Metrics(metrics_list=metrics).evaluate(y_true=y_test, y_pred=y_predict)
        naive_score = Metrics(metrics_list=metrics).evaluate(y_true=fake_test, y_pred=y_predict)
        return self.to_excel(real_score, naive_score)
    def test_classifier_dominant(self, X_train, X_test, y_train, y_test, model, metrics=["mean_absolute_error"]):
        # Choose the class with the highest count as the constant naive prediction
        # If there are classes with equal count, the first one is chosen
        k = y_train.value_counts()
        constant = k.idxmax()
        arr = [constant for i in range(len(y_test))]
        fake_test = pd.Series(arr)
        y_predict = model.predict(X_test)
        real_score = Metrics(metrics_list=metrics).evaluate(y_true=y_test, y_pred=y_predict)
        naive_score = Metrics(metrics_list=metrics).evaluate(y_true=fake_test, y_pred=y_predict)
        return self.to_excel(real_score, naive_score)
    def to_excel(self, real_score, naive_score):
        # Collect one (metric name, real score, naive score) row per metric into a dataframe
        toExcel = []
        for (k, v), (k2, v2) in zip(real_score.items(), naive_score.items()):
            toExcel.append((k, v, v2))
        return pd.DataFrame(toExcel)
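
A minimal usage sketch (not part of the mastml source) of how these baseline checks might be run against a fitted regression model. The synthetic data, the scikit-learn RandomForestRegressor, and the train/test split below are illustrative assumptions; only the Baseline_tests methods above come from this module.

if __name__ == '__main__':
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split

    # Synthetic regression data, wrapped in pandas objects as the methods above expect (assumed setup)
    X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=0)
    X = pd.DataFrame(X)
    y = pd.Series(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    # Reset indices so the positional nearest-neighbour lookups into y_train behave as intended
    X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
    y_train, y_test = y_train.reset_index(drop=True), y_test.reset_index(drop=True)

    model = RandomForestRegressor(random_state=0).fit(X_train, y_train)

    baseline = Baseline_tests()
    # Each call returns a dataframe of (metric, real score, naive score) rows; a model that adds
    # value should beat the naive score on its metric (e.g. a lower mean absolute error)
    print(baseline.test_mean(X_train, X_test, y_train, y_test, model))
    print(baseline.test_permuted(X_train, X_test, y_train, y_test, model))
    print(baseline.test_nearest_neighbour_cdist(X_train, X_test, y_train, y_test, model))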