"""
The data_cleaner module is used to clean missing or NaN values from pandas dataframes (e.g. removing NaN, imputation, etc.)
"""
import pandas as pd
import numpy as np
import logging
from sklearn.impute import SimpleImputer
import os
from scipy.linalg import orth
log = logging.getLogger('mastml')
def flag_outliers(df, conf_not_input_features, savepath, n_stdevs=3):
    """
    Method that scans the values in each X feature matrix column and flags values that are more than n_stdevs standard
    deviations from the average of that column. The index and column values of potentially problematic points are
    listed and written to an output file.

    Args:
        df: (dataframe), pandas dataframe containing data
        conf_not_input_features: (list), list of column names that are not input features (e.g. target or metadata
            columns) and are skipped when scanning for outliers
        savepath: (str), path to save the output file to
        n_stdevs: (int), number of standard deviations from the column average beyond which a value is flagged
            (default 3)

    Returns:
        None, just writes results to file
    """
    n_rows = df.shape[0]
    outlier_dict = dict()
    for col in df.columns:
        outlier_rows = list()
        outlier_vals = list()
        if col not in conf_not_input_features:
            avg = np.average(df[col])
            stdev = np.std(df[col])
            for row in range(n_rows):
                # Flag values more than n_stdevs standard deviations above or below the column average
                if df[col].iloc[row] > avg + n_stdevs * stdev:
                    outlier_rows.append(row)
                    outlier_vals.append(df[col].iloc[row])
                elif df[col].iloc[row] < avg - n_stdevs * stdev:
                    outlier_rows.append(row)
                    outlier_vals.append(df[col].iloc[row])
        outlier_dict[col] = (outlier_rows, outlier_vals)
    pd.DataFrame.from_dict(data=outlier_dict, orient='index', columns=['Indices', 'Values']).to_excel(
        os.path.join(savepath, 'data_potential_outliers.xlsx'))
    return
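
# A minimal usage sketch for flag_outliers (the dataframe, column names and save directory
# below are hypothetical examples, not part of MAST-ML):
#
#   df = pd.DataFrame({'feature_1': [1.0]*10 + [50.0], 'target': [0]*11})
#   flag_outliers(df, conf_not_input_features=['target'], savepath='.', n_stdevs=3)
#   # Writes data_potential_outliers.xlsx; the 50.0 entry in feature_1 is flagged,
#   # while the 'target' column is skipped
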
def remove(df, axis):
    """
    Method that removes entire rows or columns of data that contain any NaN or blank values

    Args:
        df: (dataframe), pandas dataframe containing data
        axis: (int), whether to remove rows (axis=0) or columns (axis=1)

    Returns:
        df: (dataframe), dataframe with rows or columns containing NaN or missing values removed
        nan_indices: (index), index values of the rows that contained NaN values
    """
    # Record the index values of rows containing at least one NaN before dropping
    nan_indices = df.index[df.isnull().any(axis=1)]
    df = df.dropna(axis=axis, how='any')
    return df, nan_indices
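
# A minimal usage sketch for remove (the dataframe is a hypothetical example):
#
#   df = pd.DataFrame({'x': [1.0, np.nan, 3.0], 'y': [4.0, 5.0, 6.0]})
#   df_clean, nan_indices = remove(df, axis=0)
#   # df_clean keeps rows 0 and 2; nan_indices contains index 1
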
def imputation(df, strategy, cols_to_leave_out=None):
    """
    Method that imputes values to the missing places based on the median, mean, etc. of the data in the column

    Args:
        df: (dataframe), pandas dataframe containing data
        strategy: (str), method of imputation, e.g. median, mean, etc.
        cols_to_leave_out: (list), list of column names to not include in imputation (e.g. columns containing strings)

    Returns:
        df: (dataframe), dataframe with NaN or missing values resolved via imputation
    """
    col_names = df.columns.tolist()
    if cols_to_leave_out is None:
        df = pd.DataFrame(SimpleImputer(missing_values=np.nan, strategy=strategy).fit_transform(df),
                          columns=col_names, index=df.index)
    else:
        df_include = df.drop(cols_to_leave_out, axis=1)
        df_hold_out = df.drop([c for c in df.columns if c not in cols_to_leave_out], axis=1)
        df_imputed = pd.DataFrame(SimpleImputer(missing_values=np.nan, strategy=strategy).fit_transform(df_include),
                                  columns=df_include.columns, index=df_include.index)
        # Need to join the imputed dataframe with the columns containing strings that were held out
        df = pd.concat([df_hold_out, df_imputed], axis=1)
    return df
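
# A minimal usage sketch for imputation (the dataframe and held-out column are hypothetical
# examples):
#
#   df = pd.DataFrame({'x': [1.0, np.nan, 3.0], 'group': ['a', 'b', 'c']})
#   df_imputed = imputation(df, strategy='mean', cols_to_leave_out=['group'])
#   # The NaN in x is replaced by 2.0; the string column 'group' is held out and rejoined
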
def ppca(df, cols_to_leave_out=None):
    """
    Method that performs an iterative probabilistic PCA (PPCA) routine, using principal components fit to the observed
    values to fill in the missing values in each column

    Args:
        df: (dataframe), pandas dataframe containing data
        cols_to_leave_out: (list), list of column names to not include in imputation

    Returns:
        df: (dataframe), dataframe with NaN or missing values resolved via imputation
    """
    col_names = df.columns.tolist()
    pca_magic = PPCA()
    if cols_to_leave_out is None:
        pca_magic.fit(np.array(df))
    else:
        pca_magic.fit(np.array(df.drop(cols_to_leave_out, axis=1)))

    # Need to un-standardize the pca-transformed data
    df_ppca = pd.DataFrame(pca_magic.data * pca_magic.stds + pca_magic.means, index=df.index)

    if cols_to_leave_out is None:
        df = df_ppca
        df.columns = col_names
    else:
        # The held-out columns are appended after the PPCA-filled columns, so order the names to match
        df = pd.concat([df_ppca, df[cols_to_leave_out]], axis=1)
        df.columns = [c for c in col_names if c not in cols_to_leave_out] + list(cols_to_leave_out)
    return df
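
# A minimal usage sketch for ppca (the dataframe is a hypothetical example; note that
# PPCA.fit drops columns with fewer than min_obs=10 observed values by default):
#
#   rng = np.random.RandomState(0)
#   df = pd.DataFrame(rng.randn(50, 3), columns=['x1', 'x2', 'x3'])
#   df.iloc[5, 0] = np.nan
#   df_filled = ppca(df)
#   # The NaN in x1 is filled with its reconstruction from the fitted components
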
def columns_with_strings(df):
    """
    Method that ascertains which columns in the data contain string entries

    Args:
        df: (dataframe), pandas dataframe containing data

    Returns:
        str_columns: (list), list of the names of columns containing string entries
    """
    str_summary = pd.DataFrame(df.applymap(type).eq(str).any())
    str_columns = str_summary.index[str_summary[0] == True].tolist()
    return str_columns
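
# A minimal usage sketch for columns_with_strings; its output can be passed as
# cols_to_leave_out to imputation or ppca (the dataframe is a hypothetical example):
#
#   df = pd.DataFrame({'x': [1.0, 2.0], 'name': ['a', 'b']})
#   columns_with_strings(df)  # returns ['name']
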
class PPCA():
    """
    Class to perform probabilistic principal component analysis (PPCA) to fill in missing data.

    This PPCA routine was taken directly from https://github.com/allentran/pca-magic. Due to import errors, for ease
    of use we have elected to copy the module here. This github repo was last accessed on 8/27/18. The code comprising
    the PPCA class below was not developed by and is not owned by the University of Wisconsin-Madison MAST-ML
    development team.
    """
    def __init__(self):
        self.raw = None
        self.data = None
        self.C = None
        self.means = None
        self.stds = None
        self.eig_vals = None
        self.var_exp = None

    def _standardize(self, X):
        if self.means is None or self.stds is None:
            raise RuntimeError("Fit model first")
        return (X - self.means) / self.stds
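
    # A brief sketch of the model fit below: fit implements the EM algorithm for
    # probabilistic PCA (Tipping & Bishop, 1999), which assumes the generative model
    #
    #     x = C z + mu + eps,    z ~ N(0, I_d),    eps ~ N(0, ss * I_D)
    #
    # where C is the D x d loading matrix, z the latent components, and ss the
    # isotropic noise variance. Each iteration fills the missing entries of the
    # standardized data with their current reconstruction X C^T (e-step), then
    # updates C and ss (m-step), stopping once the relative change in the objective
    # falls below tol.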
    def fit(self, data, d=None, tol=1e-4, min_obs=10, verbose=False):
        self.raw = data
        # Replace any infinite entries with the largest finite value in the data
        self.raw[np.isinf(self.raw)] = np.max(self.raw[np.isfinite(self.raw)])
        # Keep only columns with at least min_obs observed (non-NaN) values
        valid_series = np.sum(~np.isnan(self.raw), axis=0) >= min_obs
        data = self.raw[:, valid_series].copy()

        N = data.shape[0]
        D = data.shape[1]
        self.means = np.nanmean(data, axis=0)
        self.stds = np.nanstd(data, axis=0)

        # Standardize the data and zero-fill the missing entries as a starting point
        data = self._standardize(data)
        observed = ~np.isnan(data)
        missing = np.sum(~observed)
        data[~observed] = 0

        # initial
        if d is None:
            d = data.shape[1]

        if self.C is None:
            C = np.random.randn(D, d)
        else:
            C = self.C
        CC = np.dot(C.T, C)
        X = np.dot(np.dot(data, C), np.linalg.inv(CC))
        recon = np.dot(X, C.T)
        recon[~observed] = 0
        ss = np.sum((recon - data) ** 2) / (N * D - missing)

        v0 = np.inf
        counter = 0
        while True:
            Sx = np.linalg.inv(np.eye(d) + CC / ss)

            # e-step: fill missing entries with their current reconstruction, then
            # compute the expected latent components
            ss0 = ss
            if missing > 0:
                proj = np.dot(X, C.T)
                data[~observed] = proj[~observed]
            X = np.dot(np.dot(data, C), Sx) / ss

            # m-step: update the loadings C and the noise variance ss
            XX = np.dot(X.T, X)
            C = np.dot(np.dot(data.T, X), np.linalg.pinv(XX + N * Sx))
            CC = np.dot(C.T, C)
            recon = np.dot(X, C.T)
            recon[~observed] = 0
            ss = (np.sum((recon - data) ** 2) + N * np.sum(CC * Sx) + missing * ss0) / (N * D)

            # calc diff for convergence
            det = np.log(np.linalg.det(Sx))
            if np.isinf(det):
                det = abs(np.linalg.slogdet(Sx)[1])
            v1 = N * (D * np.log(ss) + np.trace(Sx) - det) \
                + np.trace(XX) - missing * np.log(ss0)
            diff = abs(v1 / v0 - 1)
            if verbose:
                print(diff)
            if (diff < tol) and (counter > 5):
                break

            counter += 1
            v0 = v1

        # Orthogonalize the loadings and order the components by decreasing eigenvalue
        C = orth(C)
        vals, vecs = np.linalg.eig(np.cov(np.dot(data, C).T))
        order = np.flipud(np.argsort(vals))
        vecs = vecs[:, order]
        vals = vals[order]
        C = np.dot(C, vecs)

        # attach objects to class
        self.C = C
        self.data = data
        self.eig_vals = vals
        self._calc_var()
    def _calc_var(self):
        if self.data is None:
            raise RuntimeError('Fit the data model first.')
        data = self.data.T

        # variance calc: cumulative fraction of total variance explained by each component
        var = np.nanvar(data, axis=1)
        total_var = var.sum()
        self.var_exp = self.eig_vals.cumsum() / total_var
    def save(self, fpath):
        np.save(fpath, self.C)

    def load(self, fpath):
        assert os.path.isfile(fpath)
        self.C = np.load(fpath)
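
# A minimal end-to-end sketch of the PPCA class (the data and file name are hypothetical
# examples; fit drops columns with fewer than min_obs observed values):
#
#   rng = np.random.RandomState(0)
#   data = rng.randn(100, 4)
#   data[rng.rand(100, 4) < 0.1] = np.nan
#   model = PPCA()
#   model.fit(data, d=2)
#   filled = model.data * model.stds + model.means   # un-standardized reconstruction
#   model.save('ppca_components.npy')                # persists only the loading matrix C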