# Source code for mastml.preprocessing

"""
This module contains methods to perform data preprocessing, such as various standardization/normalization methods

BasePreprocessor:
    Base class that adds some MAST-ML type functionality to other preprocessors. Other preprocessor classes all inherit
    this base class

SklearnPreprocessor:
    Class that wraps any preprocessor method from scikit-learn (e.g. StandardScaler) to have MAST-ML type functionality

NoPreprocessor:
    Class that performs no preprocessing. A preprocessor is needed in the MAST-ML evaluation of data splits. If no
    preprocessing is desired, then this NoPreprocessor class is invoked by default

MeanStdevScaler:
    Preprocessor class which scales the dataset to a particular user-specified mean and standard deviation value
    (similar in spirit to scikit-learn's StandardScaler, but implemented on top of BasePreprocessor)
"""

import sklearn.preprocessing
import pandas as pd
import os
import numpy as np
from pprint import pprint
import inspect
from datetime import datetime
import joblib

from sklearn.base import BaseEstimator, TransformerMixin

class BasePreprocessor(BaseEstimator, TransformerMixin):
    """
    Base class to provide new methods beyond sklearn fit_transform, such as dataframe support and directory management

    Args:
        preprocessor : a sklearn.preprocessor object, e.g. StandardScaler or mastml.preprocessing object

        as_frame: (bool), if True, transform/fit_transform/evaluate wrap the preprocessor output in a pd.DataFrame
            that reuses the columns and index of the input X; if False the raw preprocessor output is returned

    Methods:
        fit: method that fits the wrapped preprocessor to the data
            Args:
                X: (pd.DataFrame), dataframe of X features

                y: ignored, accepted only so the method can be called with an (X, y) signature

            Returns:
                whatever the wrapped preprocessor's fit method returns

        transform: method that transforms the data with the (already fitted) wrapped preprocessor
            Args:
                X: (pd.DataFrame), dataframe of X features

            Returns:
                Transformed data (pd.DataFrame or raw preprocessor output based on self.as_frame)

        inverse_transform: method that undoes the transformation using the wrapped preprocessor
            Args:
                X: (pd.DataFrame or numpy array), previously transformed data

            Returns:
                pd.DataFrame (same columns and index as X) if X is a dataframe, otherwise the raw output of the
                wrapped preprocessor's inverse_transform

        fit_transform: method that fits the data to the preprocessor, then transforms it to the preprocessed data
            Args:
                X: (pd.DataFrame), dataframe of X features

                y: (pd.Series), series of y target data (not used)

            Returns:
                Transformed data (pd.DataFrame or numpy array based on self.as_frame)

        evaluate: main method to evaluate a preprocessor, build directory and save data output
            Args:
                X: (pd.DataFrame), dataframe of X features

                y: (pd.Series), series of y target data (not used)

                savepath: (str), string containing main savepath to construct splits for saving output. Defaults to
                    the current working directory

                file_name: (str), suffix appended to 'data_preprocessed_' to build the saved data file name

                make_new_dir: (bool), if True, save into a new time-stamped subdirectory of savepath

                file_extension: (str), must be either '.xlsx' or '.csv', determines data file type for saving when
                    as_frame is True (array output is always written as '.csv')

            Returns:
                Xnew (pd.DataFrame or numpy array), dataframe or array of the preprocessed X features

            Raises:
                ValueError: if as_frame is True and file_extension is neither '.xlsx' nor '.csv'

        help: method to output key information on class use, e.g. methods and parameters
            Args:
                None

            Returns:
                None, but outputs help to screen

        _setup_savedir: method to create a savedir based on the preprocessor class name and datetime
            Args:
                savepath: (str), string designating the savepath

            Returns:
                splitdir: (str), string containing the new subdirectory to save results to

    """

    def __init__(self, preprocessor, as_frame=False):
        self.preprocessor = preprocessor
        self.as_frame = as_frame

    def fit(self, X, y=None):
        # y is accepted for signature parity with fit_transform; it is not forwarded to the wrapped preprocessor.
        return self.preprocessor.fit(X)

    def transform(self, X):
        transformed = self.preprocessor.transform(X=X)
        if self.as_frame:
            return pd.DataFrame(transformed, columns=X.columns, index=X.index)
        return transformed

    def inverse_transform(self, X):
        restored = self.preprocessor.inverse_transform(X)
        # Only dataframes carry columns/index to re-attach. Array input (which is what transform() returns when
        # as_frame is False) is handed back as the raw result instead of failing on a missing .columns attribute.
        if isinstance(X, pd.DataFrame):
            return pd.DataFrame(restored, columns=X.columns, index=X.index)
        return restored

    def fit_transform(self, X, y=None, **fit_params):
        transformed = self.preprocessor.fit_transform(X=X)
        if self.as_frame:
            return pd.DataFrame(transformed, columns=X.columns, index=X.index)
        return transformed

    def evaluate(self, X, y=None, savepath=None, file_name='', make_new_dir=False, file_extension='.csv'):
        # Fail before fitting or creating directories: an unsupported extension used to skip the data file silently.
        if self.as_frame and file_extension not in ('.xlsx', '.csv'):
            raise ValueError("file_extension must be either '.xlsx' or '.csv', got " + repr(file_extension))

        if not savepath:
            savepath = os.getcwd()
        if make_new_dir is True:
            splitdir = self._setup_savedir(savepath=savepath)
            self.splitdir = splitdir
            savepath = splitdir

        data_name = 'data_preprocessed_' + file_name
        if self.as_frame:
            Xnew = pd.DataFrame(self.preprocessor.fit_transform(X=X), columns=X.columns, index=X.index)
            if file_extension == '.xlsx':
                Xnew.to_excel(os.path.join(savepath, data_name + '.xlsx'))
            else:
                Xnew.to_csv(os.path.join(savepath, data_name + '.csv'))
        else:
            # Array output is always written as plain text with a .csv suffix, whatever file_extension says.
            Xnew = self.preprocessor.fit_transform(X=X)
            np.savetxt(os.path.join(savepath, data_name + '.csv'), Xnew)

        # Save the fitted preprocessor, will be needed for DLHub upload later on
        joblib.dump(self, os.path.join(savepath, str(self.preprocessor.__class__.__name__) + ".pkl"))
        self.savepath = savepath
        return Xnew

    def help(self):
        print('Documentation for', self.preprocessor)
        pprint(dict(inspect.getmembers(self.preprocessor))['__doc__'])
        print('\n')
        print('Class methods for,', self.preprocessor)
        pprint(dict(inspect.getmembers(self.preprocessor, predicate=inspect.ismethod)))
        print('\n')
        print('Class attributes for,', self.preprocessor)
        pprint(self.preprocessor.__dict__)
        return

    def _setup_savedir(self, savepath):
        # Without a savepath there is nothing to create: results go to the current working directory.
        if savepath is None:
            return os.getcwd()

        now = datetime.now()
        dirname = self.preprocessor.__class__.__name__
        dirname = f"{dirname}_{now.year:02d}_{now.month:02d}_{now.day:02d}" \
                  f"_{now.hour:02d}_{now.minute:02d}_{now.second:02d}"
        splitdir = os.path.join(savepath, dirname)
        # makedirs(exist_ok=True) also creates a missing parent directory and avoids the race between an
        # existence check and the directory creation.
        os.makedirs(splitdir, exist_ok=True)
        return splitdir
class SklearnPreprocessor(BasePreprocessor):
    """
    Wrapper that builds a scikit-learn preprocessor from its class name, e.g. StandardScaler

    Args:
        preprocessor (str): name of a class in sklearn.preprocessing, e.g. 'StandardScaler'

        as_frame (bool): whether to return data as a dataframe

        kwargs : key word arguments forwarded to the constructor of the chosen sklearn.preprocessing class

    Methods:
        See documentation of BasePreprocessor

    """

    def __init__(self, preprocessor, as_frame=False, **kwargs):
        super().__init__(preprocessor=preprocessor)
        # Resolve the class by name, then replace the stored name with a configured instance of it.
        preprocessor_class = getattr(sklearn.preprocessing, preprocessor)
        self.preprocessor = preprocessor_class(**kwargs)
        self.as_frame = as_frame
class NoPreprocessor(BasePreprocessor):
    """
    Class for having a "null" transform where the output is the same as the input. Needed by MAST-ML as a placeholder
    if certain workflow aspects are not performed.

    The instance registers itself as its own wrapped preprocessor, so every method the base class delegates to
    self.preprocessor has to be overridden here; otherwise the delegation would call itself without end.

    Args:
        preprocessor: ignored, present only for signature compatibility with the other preprocessors

        as_frame: (bool), if True, transform/fit_transform/inverse_transform rebuild the input as a pd.DataFrame
            with the same columns and index; if False the input object itself is returned

    Methods:
        fit: does nothing
            Args:
                X: (pd.DataFrame), dataframe of X features

                y: ignored

            Returns:
                X, unchanged

        transform / fit_transform / inverse_transform: identity operations
            Args:
                X: (pd.DataFrame), dataframe of X features

            Returns:
                X (rebuilt as a pd.DataFrame if as_frame is True)

        See BasePreprocessor for information on the remaining methods

    """

    def __init__(self, preprocessor=None, as_frame=False):
        super(NoPreprocessor, self).__init__(preprocessor=self)
        self.as_frame = as_frame

    def fit(self, X, y=None):
        # Historical behaviour: the data (not the estimator) is returned.
        return X

    def transform(self, X):
        if self.as_frame:
            return pd.DataFrame(X, columns=X.columns, index=X.index)
        return X

    def inverse_transform(self, X):
        # The inverse of the identity is the identity. Overriding is required because the inherited version
        # forwards to self.preprocessor, which is this very object.
        return self.transform(X)

    def fit_transform(self, X, y=None, **fit_params):
        if self.as_frame:
            return pd.DataFrame(X, columns=X.columns, index=X.index)
        return X
class MeanStdevScaler(BasePreprocessor):
    """
    Class designed to normalize input data to a specified mean and standard deviation

    The instance registers itself as its own wrapped preprocessor, so fit, transform and inverse_transform are
    implemented here rather than inherited: the inherited versions forward to self.preprocessor, i.e. back to this
    same object.

    Args:
        mean: (int/float), specified normalized mean of the data

        stdev: (int/float), specified normalized standard deviation of the data

        as_frame: (bool), stored on the instance; the scaling methods of this class always return a dataframe

    Methods:
        fit: Obtains initial mean and stdev of data
            Args:
                X: (dataframe), dataframe of values to be normalized

                y: ignored

            Returns:
                (self, the object instance)

        transform: Normalizes the data to new mean and stdev values
            Args:
                X: (dataframe), dataframe of values to be normalized; must contain the fitted feature columns

            Returns:
                (dataframe), dataframe containing re-normalized data

        inverse_transform: Un-normalizes the data to the old mean and stdev values
            Args:
                X: (dataframe), dataframe of values to be un-normalized; must contain the fitted feature columns

            Returns:
                (dataframe), dataframe containing un-normalized data

        fit_transform: Fits to the data, then normalizes that same data
            Args:
                X: (dataframe), dataframe of values to be normalized

                y: ignored

            Returns:
                (dataframe), dataframe containing re-normalized data

    """

    def __init__(self, mean=0, stdev=1, as_frame=False):
        super(MeanStdevScaler, self).__init__(preprocessor=self, as_frame=as_frame)
        self.mean = mean
        self.stdev = stdev

    def fit(self, X, y=None):
        self.features = X.columns.tolist()
        # Per-feature statistics of the fitting data, stored in the same order as self.features.
        # pandas' Series.std() is used, i.e. the sample standard deviation.
        self.feature_means = [X[feature].mean() for feature in self.features]
        self.feature_stdevs = [X[feature].std() for feature in self.features]
        return self

    def transform(self, X):
        # Standardize each feature with its fitted statistics, then rescale to the requested stdev and mean.
        # A feature with zero fitted stdev produces non-finite values, as the division is not guarded.
        scaled = [
            ((X[feature].values - old_mean) / old_stdev) * self.stdev + self.mean
            for feature, old_mean, old_stdev in zip(self.features, self.feature_means, self.feature_stdevs)
        ]
        return pd.DataFrame(np.array(scaled).T, columns=self.features, index=X.index)

    def inverse_transform(self, X):
        # Exact algebraic inverse of transform(): undo the target scaling, then restore the fitted statistics.
        restored = [
            ((X[feature].values - self.mean) / self.stdev) * old_stdev + old_mean
            for feature, old_mean, old_stdev in zip(self.features, self.feature_means, self.feature_stdevs)
        ]
        return pd.DataFrame(np.array(restored).T, columns=self.features, index=X.index)

    def fit_transform(self, X, y=None, **fit_params):
        return self.fit(X).transform(X)