Source code for mastml.data_cleaner

"""
The data_cleaner module is used to clean missing or NaN values from pandas dataframes, e.g. by removing affected rows or columns, imputation, or probabilistic PCA (PPCA).
"""

import logging
import os

import numpy as np
import pandas as pd
from scipy.linalg import orth
from sklearn.impute import SimpleImputer

log = logging.getLogger('mastml')

def flag_outliers(df, conf_not_input_features, savepath, n_stdevs=3):
    """
    Method that scans the values in each X feature matrix column and flags values that lie more than n_stdevs
    standard deviations from the average of that column. The index and column values of potentially problematic
    points are listed and written to an output file.

    Args:
        df: (dataframe), pandas dataframe containing data

        conf_not_input_features: (list), list of column names (e.g. target data) to exclude from the outlier scan

        savepath: (str), path to save the output file to

        n_stdevs: (int), number of standard deviations from the column average beyond which a value is flagged

    Returns:
        None, just writes results to file
    """
    n_rows = df.shape[0]
    outlier_dict = dict()
    for col in df.columns:
        outlier_rows = list()
        outlier_vals = list()
        if col not in conf_not_input_features:
            avg = np.average(df[col])
            stdev = np.std(df[col])
            for row in range(n_rows):
                if df[col].iloc[row] > avg + n_stdevs*stdev:
                    outlier_rows.append(row)
                    outlier_vals.append(df[col].iloc[row])
                elif df[col].iloc[row] < avg - n_stdevs*stdev:
                    outlier_rows.append(row)
                    outlier_vals.append(df[col].iloc[row])
            outlier_dict[col] = (outlier_rows, outlier_vals)
    # Cast the index/value lists to strings so they serialize cleanly to Excel
    outlier_dict = {col: (str(rows), str(vals)) for col, (rows, vals) in outlier_dict.items()}
    pd.DataFrame.from_dict(data=outlier_dict, orient='index',
                           columns=['Indices', 'Values']).to_excel(os.path.join(savepath, 'data_potential_outliers.xlsx'))
    return
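
# Usage sketch (not part of the original module): a toy dataframe with one obvious outlier
# in feature column 'x'; the hypothetical target column 'y' is excluded from the scan via
# conf_not_input_features. Writing the .xlsx report requires openpyxl to be installed.
def _demo_flag_outliers(savepath='.'):
    df = pd.DataFrame({'x': [1.0] * 20 + [100.0], 'y': np.arange(21.0)})
    flag_outliers(df, conf_not_input_features=['y'], savepath=savepath, n_stdevs=3)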

def remove(df, axis):
    """
    Method that removes a full column or row of data values if that column or row contains NaN or is blank

    Args:
        df: (dataframe), pandas dataframe containing data

        axis: (int), whether to remove rows (axis=0) or columns (axis=1)

    Returns:
        df: (dataframe), dataframe with NaN or missing values removed

        nan_indices: (index), row indices that contained NaN values before removal
    """
    # Collect the indices of rows containing at least one NaN before dropping anything
    nan_indices = df.index[df.isnull().any(axis=1)]
    df = df.dropna(axis=axis, how='any')
    return df, nan_indices
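
# Usage sketch (not part of the original module): drops the row containing a NaN and
# reports which row indices were removed.
def _demo_remove():
    df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, 6.0]})
    df_clean, nan_indices = remove(df, axis=0)
    print(df_clean.shape, list(nan_indices))  # prints: (2, 2) [1]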

def imputation(df, strategy, cols_to_leave_out=None):
    """
    Method that imputes missing values based on the median, mean, etc. of the data in the column

    Args:
        df: (dataframe), pandas dataframe containing data

        strategy: (str), method of imputation, e.g. median, mean, etc.

        cols_to_leave_out: (list), list of column names to not include in imputation

    Returns:
        df: (dataframe), dataframe with NaN or missing values resolved via imputation
    """
    if cols_to_leave_out is None:
        df = pd.DataFrame(SimpleImputer(missing_values=np.nan, strategy=strategy).fit_transform(df),
                          columns=df.columns)
    else:
        df_include = df.drop(cols_to_leave_out, axis=1)
        df_hold_out = df.drop([c for c in df.columns if c not in cols_to_leave_out], axis=1)
        df_imputed = pd.DataFrame(SimpleImputer(missing_values=np.nan, strategy=strategy).fit_transform(df_include),
                                  columns=df_include.columns)
        # Need to re-join the imputed dataframe with the held-out columns (e.g. columns containing strings)
        df = pd.concat([df_hold_out, df_imputed], axis=1)
    return df
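
# Usage sketch (not part of the original module): fills the missing value in column 'a'
# with the column median while holding the string column 'group' out of the imputation.
def _demo_imputation():
    df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'group': ['x', 'y', 'z']})
    df_imputed = imputation(df, strategy='median', cols_to_leave_out=['group'])
    print(df_imputed)  # the NaN in 'a' is replaced by the column median, 2.0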

def ppca(df, cols_to_leave_out=None):
    """
    Method that performs a recursive PCA routine that uses the PCA of the known columns to fill in missing
    values in a particular column

    Args:
        df: (dataframe), pandas dataframe containing data

        cols_to_leave_out: (list), list of column names to not include in the PPCA routine

    Returns:
        df: (dataframe), dataframe with NaN or missing values resolved via imputation
    """
    col_names = df.columns.tolist()
    pca_magic = PPCA()
    if cols_to_leave_out is None:
        pca_magic.fit(np.array(df))
    else:
        pca_magic.fit(np.array(df.drop(cols_to_leave_out, axis=1)))

    # Need to un-standardize the pca-transformed data
    df_ppca = pd.DataFrame(pca_magic.data*pca_magic.stds+pca_magic.means)

    if cols_to_leave_out is None:
        df = df_ppca
    else:
        # Note: the original column names are re-applied in order below, so this assumes the
        # left-out columns are the trailing columns of df
        df = pd.concat([df_ppca, df[cols_to_leave_out]], axis=1)
    df.columns = col_names
    return df
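
# Usage sketch (not part of the original module): fills a missing entry in a small,
# strongly correlated dataset via the PPCA routine. The fit is randomly initialized,
# so the recovered value varies slightly between runs.
def _demo_ppca():
    rng = np.random.RandomState(0)
    base = rng.randn(50)
    df = pd.DataFrame({'f1': base, 'f2': 2.0 * base + 0.1 * rng.randn(50)})
    df.iloc[3, 0] = np.nan
    print(ppca(df).iloc[3])  # 'f1' at row 3 is now an imputed estimate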

def columns_with_strings(df):
    """
    Method that ascertains which columns in the data contain string entries

    Args:
        df: (dataframe), pandas dataframe containing data

    Returns:
        str_columns: (list), list of the names of columns containing strings
    """
    str_summary = pd.DataFrame(df.applymap(type).eq(str).any())
    str_columns = str_summary.index[str_summary[0] == True].tolist()
    return str_columns
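
# Usage sketch (not part of the original module): identifies 'b' as the only column
# containing string entries.
def _demo_columns_with_strings():
    df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
    print(columns_with_strings(df))  # prints: ['b']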

class PPCA():
    """
    Class to perform probabilistic principal component analysis (PPCA) to fill in missing data.

    This PPCA routine was taken directly from https://github.com/allentran/pca-magic. Due to import errors, for
    ease of use we have elected to copy the module here. This github repo was last accessed on 8/27/18. The code
    comprising the PPCA class below was not developed by and is not owned by the University of Wisconsin-Madison
    MAST-ML development team.
    """

    def __init__(self):
        self.raw = None
        self.data = None
        self.C = None
        self.means = None
        self.stds = None
        self.eig_vals = None

    def _standardize(self, X):
        if self.means is None or self.stds is None:
            raise RuntimeError("Fit model first")
        return (X - self.means) / self.stds

    def fit(self, data, d=None, tol=1e-4, min_obs=10, verbose=False):
        self.raw = data
        # Replace any infinite entries with the largest finite value in the data
        self.raw[np.isinf(self.raw)] = np.max(self.raw[np.isfinite(self.raw)])

        # Only keep columns with at least min_obs observed (non-NaN) values
        valid_series = np.sum(~np.isnan(self.raw), axis=0) >= min_obs
        data = self.raw[:, valid_series].copy()

        N = data.shape[0]
        D = data.shape[1]

        self.means = np.nanmean(data, axis=0)
        self.stds = np.nanstd(data, axis=0)

        data = self._standardize(data)
        observed = ~np.isnan(data)
        missing = np.sum(~observed)
        data[~observed] = 0

        # initial
        if d is None:
            d = data.shape[1]

        if self.C is None:
            C = np.random.randn(D, d)
        else:
            C = self.C
        CC = np.dot(C.T, C)
        X = np.dot(np.dot(data, C), np.linalg.inv(CC))
        recon = np.dot(X, C.T)
        recon[~observed] = 0
        ss = np.sum((recon - data) ** 2) / (N * D - missing)

        v0 = np.inf
        counter = 0

        while True:
            Sx = np.linalg.inv(np.eye(d) + CC / ss)

            # e-step
            ss0 = ss
            if missing > 0:
                proj = np.dot(X, C.T)
                data[~observed] = proj[~observed]
            X = np.dot(np.dot(data, C), Sx) / ss

            # m-step
            XX = np.dot(X.T, X)
            C = np.dot(np.dot(data.T, X), np.linalg.pinv(XX + N * Sx))
            CC = np.dot(C.T, C)
            recon = np.dot(X, C.T)
            recon[~observed] = 0
            ss = (np.sum((recon - data) ** 2) + N * np.sum(CC * Sx) + missing * ss0) / (N * D)

            # calc diff for convergence
            det = np.log(np.linalg.det(Sx))
            if np.isinf(det):
                det = abs(np.linalg.slogdet(Sx)[1])
            v1 = N * (D * np.log(ss) + np.trace(Sx) - det) \
                + np.trace(XX) - missing * np.log(ss0)
            diff = abs(v1 / v0 - 1)
            if verbose:
                print(diff)
            if (diff < tol) and (counter > 5):
                break

            counter += 1
            v0 = v1

        # Orthogonalize the loadings and order the components by explained variance
        C = orth(C)
        vals, vecs = np.linalg.eig(np.cov(np.dot(data, C).T))
        order = np.flipud(np.argsort(vals))
        vecs = vecs[:, order]
        vals = vals[order]

        C = np.dot(C, vecs)

        # attach objects to class
        self.C = C
        self.data = data
        self.eig_vals = vals
        self._calc_var()

    def transform(self, data=None):
        if self.C is None:
            raise RuntimeError('Fit the data model first.')
        if data is None:
            return np.dot(self.data, self.C)
        return np.dot(data, self.C)

    def _calc_var(self):
        if self.data is None:
            raise RuntimeError('Fit the data model first.')

        data = self.data.T

        # variance calc
        var = np.nanvar(data, axis=1)
        total_var = var.sum()
        self.var_exp = self.eig_vals.cumsum() / total_var

    def save(self, fpath):
        np.save(fpath, self.C)

    def load(self, fpath):
        assert os.path.isfile(fpath)
        self.C = np.load(fpath)
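
# Usage sketch (not part of the original module): exercises the copied PPCA class directly.
# The model is fit on data containing a missing entry, used to project onto the principal
# axes, and then round-tripped through a .npy file. The file name 'ppca_components.npy'
# is arbitrary, chosen here only for illustration.
def _demo_ppca_class(fpath='ppca_components.npy'):
    rng = np.random.RandomState(0)
    data = rng.randn(100, 3)
    data[5, 0] = np.nan  # one missing value, filled in during fitting
    model = PPCA()
    model.fit(data, d=2)
    print(model.transform().shape)  # prints: (100, 2)
    model.save(fpath)
    restored = PPCA()
    restored.load(fpath)
    assert np.allclose(model.C, restored.C)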