"""
This module contains a collection of classes and methods for selecting features, and interfaces with scikit-learn feature
selectors. More information on scikit-learn feature selectors is available at:
http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
"""
from functools import wraps
import warnings
import numpy as np
from mastml.metrics import root_mean_squared_error
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
import sklearn.feature_selection as fs
from mlxtend.feature_selection import SequentialFeatureSelector
import os, logging
## XIYU's import for PearsonSelector
import copy
from numpy import cov
import xlsxwriter
from scipy.stats import pearsonr
##
log = logging.getLogger('mastml')
from mastml.legos import util_legos
def dataframify_selector(transform):
    """
    Wrap a scikit-learn feature selector's transform method so that dataframe
    inputs come back as dataframes, preserving the original column names.

    Args:
        transform: (function), a scikit-learn feature selector that has a transform method

    Returns:
        new_transform: (function), an amended version of the transform method that returns a dataframe
    """
    @wraps(transform)
    def new_transform(self, df):
        # Anything that is not a dataframe is passed through untouched,
        # just in case this gets called with a plain array.
        if not isinstance(df, pd.DataFrame):
            return df
        # Select the surviving columns directly from the dataframe so the
        # column names survive the selection step.
        kept_columns = df.columns[self.get_support(indices=True)]
        return df[kept_columns]
    return new_transform
def dataframify_new_column_names(transform, name):
    """
    Wrap a transform method so its array output becomes a dataframe with
    generated column names name0, name1, ... (e.g. 'pca_0', 'pca_1').

    Args:
        transform: (function), a scikit-learn feature selector that has a transform method
        name: (str), prefix of the feature selector used for the generated column names

    Returns:
        new_transform: (function), an amended version of the transform method that returns a dataframe
    """
    # Fix: apply functools.wraps (as dataframify_selector already does) so the
    # wrapped transform keeps its original __name__/__doc__ metadata.
    @wraps(transform)
    def new_transform(self, df):
        arr = transform(self, df.values)
        labels = [name + str(i) for i in range(arr.shape[1])]
        return pd.DataFrame(arr, columns=labels)
    return new_transform
def fitify_just_use_values(fit):
    """
    Wrap a feature selector's fit method so it can be called with dataframes,
    forwarding only the underlying numpy values.

    Args:
        fit: (function), a scikit-learn feature selector object with a fit method

    Returns:
        new_fit: (function), an amended version of the fit method that uses dataframes as input
    """
    def new_fit(self, X_df, y_df):
        # Strip the pandas wrappers before delegating to the original fit
        X_values = X_df.values
        y_values = y_df.values
        return fit(self, X_values, y_values)
    return new_fit
# Registry of sklearn selectors whose first constructor argument is a scoring
# function (e.g. f_regression):
score_func_selectors = {
    'GenericUnivariateSelect': fs.GenericUnivariateSelect, # Univariate feature selector with configurable strategy.
    'SelectFdr': fs.SelectFdr, # Filter: Select the p-values for an estimated false discovery rate
    'SelectFpr': fs.SelectFpr, # Filter: Select the pvalues below alpha based on a FPR test.
    'SelectFwe': fs.SelectFwe, # Filter: Select the p-values corresponding to Family-wise error rate
    'SelectKBest': fs.SelectKBest, # Select features according to the k highest scores.
    'SelectPercentile': fs.SelectPercentile, # Select features according to a percentile of the highest scores.
}
model_selectors = { # feature selectors which take a model instance as first parameter
    'RFE': fs.RFE, # Feature ranking with recursive feature elimination.
    'RFECV': fs.RFECV, # Feature ranking with recursive feature elimination and cross-validated selection of the best number of features.
    'SelectFromModel': fs.SelectFromModel, # Meta-transformer for selecting features based on importance weights.
}
other_selectors = {
    'VarianceThreshold': fs.VarianceThreshold, # Feature selector that removes all low-variance features.
}
# Union together the above dicts for the primary export:
name_to_constructor = dict(**score_func_selectors, **model_selectors, **other_selectors)
# Modify all sklearn transform methods to return dataframes:
for constructor in name_to_constructor.values():
    # Keep a handle to the unwrapped transform, then monkey-patch the class so
    # dataframe inputs keep their column names through selection.
    constructor.old_transform = constructor.transform
    constructor.transform = dataframify_selector(constructor.transform)
class EnsembleModelFeatureSelector(object):
    """
    Class custom-written for MAST-ML to conduct selection of features with ensemble model feature importances

    Args:
        estimator: (scikit-learn model/estimator object), must be one of RandomForestRegressor,
            ExtraTreesRegressor or GradientBoostingRegressor (checked by class name)
        k_features: (int), the number of features to select

    Methods:
        fit: performs feature selection
            Args:
                X: (dataframe), dataframe of X features
                y: (dataframe), dataframe of y data
            Returns:
                self
        transform: performs the transform to generate output of only selected features
            Args:
                X: (dataframe), dataframe of X features
            Returns:
                dataframe: (dataframe), dataframe of selected X features
    """
    def __init__(self, estimator, k_features):
        self.estimator = estimator
        self.k_features = k_features
        # Fail fast if an unsupported estimator type was passed in
        self._check_model()
        # Populated by fit(): names of the top-k features, most important first
        self.selected_features = list()

    def _check_model(self):
        # The check is by class name, so any object exposing fit() and
        # feature_importances_ under one of these names is accepted.
        if self.estimator.__class__.__name__ not in ['RandomForestRegressor', 'ExtraTreesRegressor', 'GradientBoostingRegressor']:
            raise ValueError('Models used in EnsembleModelFeatureSelector must be one of RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor')
        return

    def fit(self, X, y=None):
        """Fit the estimator on X, y and record the k most important feature names."""
        feature_importances = self.estimator.fit(X, y).feature_importances_
        # Map each column name to its importance score
        feature_importance_dict = dict(zip(X.columns.tolist(), feature_importances))
        # Sort (importance, name) pairs so the most important feature comes first;
        # ties are broken by name, descending (same as the original tuple sort).
        feature_importances_sorted = sorted(((f, col) for col, f in feature_importance_dict.items()), reverse=True)
        sorted_features_list = [f[1] for f in feature_importances_sorted]
        self.selected_features = sorted_features_list[0:self.k_features]
        return self

    def transform(self, X):
        """Return the subset of X containing only the selected features.

        Note: fit() must be called first. This method is promised by the class
        docstring but was missing from the implementation; added here.
        """
        return X[self.selected_features]
class PearsonSelector(object):
    """
    Class custom-written for MAST-ML to conduct selection of features based on Pearson correlation coefficient between
    features and target. Can also be used for dimensionality reduction by removing redundant features highly correlated
    with each other.

    Args:
        threshold_between_features: (float), the threshold to decide whether redundant features are removed. Should be
            a decimal value between 0 and 1. Only used if remove_highly_correlated_features is True
        threshold_with_target: (float), the threshold to decide whether a given feature is sufficiently correlated with
            the target feature and thus kept as a selected feature. Should be a decimal value between 0 and 1.
        remove_highly_correlated_features: (bool), whether to remove features highly correlated with each other
        k_features: (int), the number of features to select

    Methods:
        fit: performs feature selection and writes diagnostic .xlsx files to savepath
            Args:
                X: (dataframe), dataframe of X features
                savepath: (str), directory where the output spreadsheets are written
                y: (series/array), target data; required despite the None default (see fit)
            Returns:
                self
    """
    def __init__(self, threshold_between_features, threshold_with_target, remove_highly_correlated_features, k_features):
        self.threshold_between_features = threshold_between_features
        self.threshold_with_target = threshold_with_target
        self.remove_highly_correlated_features = remove_highly_correlated_features
        self.k_features = k_features
        # Populated by fit(): selected feature names, strongest correlation first
        self.selected_features = list()

    def fit(self, X, savepath, y=None, Xgroups=None):
        # NOTE(review): y defaults to None but pearsonr(y, ...) below always needs it,
        # so callers must supply y. Xgroups is accepted but never used in this method.
        df = X
        df_features = df.columns.tolist()
        n_col = df.shape[1]
        if self.remove_highly_correlated_features == True:
            # Build the full n_col x n_col Pearson correlation matrix of the features
            array_data = list()
            for i in range(n_col):
                col_data = df.iloc[:, i]
                col = list()
                for j in range(n_col):
                    row_data = df.iloc[:, j]
                    corr, _ = pearsonr(row_data, col_data) # Pearson Correlation
                    col.append(corr)
                array_data.append(col)
            array_df = pd.DataFrame(array_data, index=df_features[:n_col], columns=df_features[:n_col])
            array_df.to_excel(os.path.join(savepath, 'Full_correlation_matrix.xlsx'))
            #### Print features highly-correlated to each other into excel
            hcorr = dict()
            highly_correlated_features = list()
            for i in range(len(array_df.iloc[0, :])): # This includes all the data in the array_df
                # feature1 = array_df.iloc[:, i] # This does not work because feature1 is not the col name but a list of values
                feature1 = array_df.columns[i]
                for j in range(len(array_df.iloc[0, :])): # This includes all the data in the array_df
                    # feature2 = array_df.iloc[:, j] # This does not work because feature2 is not the col name but a list of values
                    feature2 = array_df.columns[j]
                    # NOTE(review): the i-1 / j-1 offsets read a cell shifted by one position
                    # (wrapping to the last row/column when i or j is 0), so the value compared
                    # may not be the correlation of (feature1, feature2) — confirm whether
                    # iloc[i, j] was intended.
                    if abs(array_df.iloc[i - 1, j - 1]) >= np.float64(self.threshold_between_features):
                        if i != j: # Ignore diagonal features
                            if not (feature2, feature1) in hcorr: # Ignore the same correlations
                                hcorr[(feature1, feature2)] = array_df.iloc[i - 1, j - 1]
                                highly_correlated_features.append(feature2)
            hcorr_df = pd.DataFrame(hcorr, index=["Corr"])
            hcorr_df.to_excel(os.path.join(savepath, 'Highly_correlated_features.xlsx'))
            highly_correlated_features = list(np.unique(np.array(highly_correlated_features)))
            #### Print the removed features and the new smaller dataframe with features removed
            # Deep copy the original dataframe
            removed_features_df = copy.deepcopy(X)
            new_df = copy.deepcopy(X)
            # Drop the features that can be removed
            # NOTE(review): this drops the features that are NOT in highly_correlated_features,
            # i.e. new_df keeps only the highly correlated ones — the naming suggests the
            # opposite intent; confirm against the intended selection behavior.
            all_features = list(new_df.columns)
            removed_features = list()
            for feature in all_features:
                if feature not in highly_correlated_features:
                    removed_features.append(feature)
                    new_df = new_df.drop(columns=feature)
            # Print the highly correlated features that were removed
            for feature in all_features:
                if feature not in removed_features:
                    removed_features_df = removed_features_df.drop(columns=feature)
            removed_features_df.to_excel(os.path.join(savepath, "Highly_correlated_features_removed.xlsx"),
                                         index=False)
            # Define self.selected_features
            remaining_features = list(new_df.columns)
        else:
            remaining_features = list(df.columns)
        # Compute Pearson correlations between each feature and target feature
        all_corrs = {}
        for i in range(len(remaining_features)):
            # NOTE(review): indexes the ORIGINAL dataframe by position rather than by the
            # names in remaining_features; if features were removed above, position i may
            # refer to a different column than remaining_features[i] — confirm.
            feature_name = df.columns[i]
            feature_data = df.iloc[:, i]
            corr, _ = pearsonr(y, feature_data)
            all_corrs[feature_name] = corr
        # Work with absolute correlation strengths from here on
        all_corrs = abs(pd.Series(all_corrs))
        self.selected_features = list(all_corrs[all_corrs > self.threshold_with_target].sort_values(
            ascending=False).keys())
        # Sometimes the specified threshold is too high. Lower it in 0.05 steps until
        # at least k_features features are selected (or all features are included).
        while len(self.selected_features) < self.k_features:
            log.debug('WARNING: Pearson selector threshold was too high to result in selecting any features, lowering threshold to get specified feature number')
            self.threshold_with_target -= 0.05
            self.selected_features = list(all_corrs[all_corrs > self.threshold_with_target].sort_values(
                ascending=False).keys())
            if len(self.selected_features) == n_col:
                log.debug('WARNING: Pearson selector reduce the threshold such that all features were included')
                break
            log.debug('Pearson selector selected features with an adjusted threshold value')
        # If the threshold admitted more than k features, truncate to the top k
        if len(self.selected_features) > self.k_features:
            self.selected_features = list(all_corrs[all_corrs > self.threshold_with_target].sort_values(ascending=False).keys())[:self.k_features]
        # Create a Pandas Excel writer using XlsxWriter as the engine.
        writer = pd.ExcelWriter(os.path.join(savepath, 'Features_highly_correlated_with_target.xlsx'), engine='xlsxwriter')
        # Create the dataframe displaying the highly correlated features and the Pearson Correlations
        hcorr_with_target_df = pd.DataFrame(all_corrs,
                                            index=list(all_corrs[all_corrs > self.threshold_with_target].sort_values(
                                                ascending=False).keys()),
                                            columns=["Pearson Correlation (absolute value)"])
        hcorr_with_target_df.to_excel(writer, sheet_name='Sheet1', index=True)
        hcorr_with_target_features = list(hcorr_with_target_df.index)
        # Create dataframe containing the highly correlated features
        all_features = list(df.columns)
        for feature in all_features:
            if not feature in hcorr_with_target_features:
                df = df.drop(columns=feature)
        # Reorder the dataframe by columns (by their correlation to the target feature)
        df = df.reindex(columns=hcorr_with_target_features)
        # Print the dataframe to a spreadsheet
        df.to_excel(writer, sheet_name='Sheet2',
                    index=False) # From left to right, the strength of correlation decreases.
        # Close the Pandas Excel writer and output the Excel file.
        # NOTE(review): ExcelWriter.save() was removed in pandas 2.0 (use close()) —
        # confirm against the project's pinned pandas version.
        writer.save()
        return self
class MASTMLFeatureSelector(object):
    """
    Class custom-written for MAST-ML to conduct forward selection of features with flexible model and cv scheme

    Args:
        estimator: (scikit-learn model/estimator object), a scikit-learn model/estimator
        n_features_to_select: (int), the number of features to select
        cv: (scikit-learn cross-validation object), a scikit-learn cross-validation object
        manually_selected_features: (list), a list of features manually set by the user. The feature selector will
            first start from this list of features and sequentially add features until n_features_to_select is met.
            Defaults to an empty list.

    Methods:
        fit: performs forward feature selection
            Args:
                X: (dataframe), dataframe of X features
                y: (dataframe), dataframe of y data
                savepath: (str), directory where per-iteration CSV progress files are written
                Xgroups: (dataframe), dataframe of group labels (optional)
            Returns:
                self
    """
    def __init__(self, estimator, n_features_to_select, cv, manually_selected_features=None):
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.cv = cv
        # Bug fix: the old default of list() was a single shared mutable object, and
        # selected_feature_names aliased the caller's list; fit() then appended to it,
        # corrupting both the default and the caller's argument. Copy defensively.
        if manually_selected_features is None:
            manually_selected_features = []
        self.manually_selected_features = list(manually_selected_features)
        self.selected_feature_names = list(self.manually_selected_features)

    def fit(self, X, y, savepath, Xgroups=None):
        """Greedily add the feature whose inclusion gives the lowest CV RMSE, until
        n_features_to_select features have been chosen. Writes a progress CSV to
        savepath after every iteration."""
        # Bug fix: Xgroups defaults to None but was dereferenced unconditionally;
        # treat None the same as an empty group dataframe (single dummy group).
        if Xgroups is None or Xgroups.shape[0] == 0:
            xgroups = np.zeros(len(y))
            Xgroups = pd.DataFrame(xgroups)
        selected_feature_avg_rmses = list()
        selected_feature_std_rmses = list()
        basic_forward_selection_dict = dict()
        num_features_selected = 0
        x_features = X.columns.tolist()
        # Cannot select more features than exist in X
        if self.n_features_to_select >= len(x_features):
            self.n_features_to_select = len(x_features)
        while num_features_selected < self.n_features_to_select:
            log.info('On number of features selected')
            log.info(str(num_features_selected))
            # Catch pandas warnings here
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                ranked_features = self._rank_features(X=X, y=y, groups=Xgroups)
                top_feature_name, top_feature_avg_rmse, top_feature_std_rmse = self._choose_top_feature(ranked_features=ranked_features)
            self.selected_feature_names.append(top_feature_name)
            log.info('selected features')
            log.info(self.selected_feature_names)
            selected_feature_avg_rmses.append(top_feature_avg_rmse)
            selected_feature_std_rmses.append(top_feature_std_rmse)
            # Record the bookkeeping for this selection step
            basic_forward_selection_dict[str(num_features_selected)] = {
                'Number of features selected': num_features_selected + 1,
                'Top feature added this iteration': top_feature_name,
                'Avg RMSE using top features': top_feature_avg_rmse,
                'Stdev RMSE using top features': top_feature_std_rmse,
            }
            # Save progress for every loop of selecting features
            pd.DataFrame(basic_forward_selection_dict).to_csv(os.path.join(savepath, 'MASTMLFeatureSelector_data_feature_' + str(num_features_selected) + '.csv'))
            num_features_selected += 1
        # Bug fix: guard so an empty selection loop (e.g. n_features_to_select already
        # satisfied by manually_selected_features) does not raise a KeyError here.
        if basic_forward_selection_dict:
            last_step = basic_forward_selection_dict[str(self.n_features_to_select - 1)]
            last_step['Full feature set Names'] = self.selected_feature_names
            last_step['Full feature set Avg RMSEs'] = selected_feature_avg_rmses
            last_step['Full feature set Stdev RMSEs'] = selected_feature_std_rmses
        return self

    def _rank_features(self, X, y, groups):
        """Evaluate each not-yet-selected feature by CV RMSE when it is added to the
        currently selected set; return {feature: {'avg_rmse': ..., 'std_rmse': ...}}."""
        y = np.array(y).reshape(-1, 1)
        ranked_features = dict()
        if groups is not None:
            groups = groups.iloc[:, 0].tolist()
        for col in X.columns:
            if col in self.selected_feature_names:
                continue
            # Candidate design matrix = already-selected features plus this column
            X_ = X.loc[:, self.selected_feature_names]
            X__ = X.loc[:, col]
            X_ = np.array(pd.concat([X_, X__], axis=1))
            # Bug fix: the RMSE list is now reset for every candidate feature;
            # previously it accumulated across candidates, so each feature's average
            # included all previously evaluated features' scores.
            tests_metrics = list()
            for trains, tests in self.cv.split(X_, y, groups):
                self.estimator.fit(X_[trains], y[trains])
                predict_tests = self.estimator.predict(X_[tests])
                tests_metrics.append(root_mean_squared_error(y[tests], predict_tests))
            ranked_features[col] = {"avg_rmse": np.mean(tests_metrics), "std_rmse": np.std(tests_metrics)}
        return ranked_features

    def _choose_top_feature(self, ranked_features):
        """Return (name, avg_rmse, std_rmse) of the candidate with the lowest average RMSE.

        Ties are broken by insertion order of ranked_features (first seen wins),
        matching the original sort-then-scan behavior.
        """
        top_feature_name, stats = min(ranked_features.items(), key=lambda item: item[1]['avg_rmse'])
        return top_feature_name, stats['avg_rmse'], stats['std_rmse']

    def _get_featureselected_dataframe(self, X, selected_feature_names):
        """Return the dataframe containing only the selected features."""
        return X.loc[:, selected_feature_names]
# Include Principal Component Analysis; its transform output gets columns pca_0, pca_1, ...
PCA.transform = dataframify_new_column_names(PCA.transform, 'pca_')
# Include Sequential Forward Selector; its transform output gets columns sfs_0, sfs_1, ...
# and its fit is adapted to accept dataframes directly.
SequentialFeatureSelector.transform = dataframify_new_column_names(SequentialFeatureSelector.transform, 'sfs_')
SequentialFeatureSelector.fit = fitify_just_use_values(SequentialFeatureSelector.fit)
model_selectors['SequentialFeatureSelector'] = SequentialFeatureSelector
name_to_constructor['SequentialFeatureSelector'] = SequentialFeatureSelector
# Custom selectors don't need to be dataframified (they already work on dataframes),
# so they are registered without wrapping.
name_to_constructor.update({
    #'PassThrough': PassThrough,
    'DoNothing': util_legos.DoNothing,
    'PCA': PCA,
    'SequentialFeatureSelector': SequentialFeatureSelector,
    'MASTMLFeatureSelector' : MASTMLFeatureSelector,
    'PearsonSelector': PearsonSelector,
    'EnsembleModelFeatureSelector': EnsembleModelFeatureSelector
})