
"""
This module contains a collection of classes and methods for selecting features, and interfaces with scikit-learn feature
selectors. More information on scikit-learn feature selectors is available at:

http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
"""

from functools import wraps
import warnings
import numpy as np
from mastml.metrics import root_mean_squared_error

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
import sklearn.feature_selection as fs
from mlxtend.feature_selection import SequentialFeatureSelector
import os
import logging

## XIYU's import for PearsonSelector
import copy
from numpy import cov
import xlsxwriter
from scipy.stats import pearsonr
##

log = logging.getLogger('mastml')

from mastml.legos import util_legos

def dataframify_selector(transform):
    """
    Decorator which transforms the output of scikit-learn feature selectors from array to dataframe.
    Enables preservation of column names.

    Args:
        transform: (function), the transform method of a scikit-learn feature selector

    Returns:
        new_transform: (function), an amended version of the transform method that returns a dataframe
    """
    @wraps(transform)
    def new_transform(self, df):
        if isinstance(df, pd.DataFrame):
            return df[df.columns[self.get_support(indices=True)]]
        else:  # just in case you try to use it with an array ;)
            return df
    return new_transform
def dataframify_new_column_names(transform, name):
    """
    Decorator which transforms the output of scikit-learn feature selectors to a dataframe and adds column names

    Args:
        transform: (function), the transform method of a scikit-learn feature selector

        name: (str), name of the feature selector, used as the column-name prefix

    Returns:
        new_transform: (function), an amended version of the transform method that returns a dataframe
    """
    def new_transform(self, df):
        arr = transform(self, df.values)
        labels = [name + str(i) for i in range(arr.shape[1])]
        return pd.DataFrame(arr, columns=labels)
    return new_transform
def fitify_just_use_values(fit):
    """
    Decorator which enables a feature selector fit method to operate on dataframes

    Args:
        fit: (function), the fit method of a scikit-learn feature selector

    Returns:
        new_fit: (function), an amended version of the fit method that accepts dataframes as input
    """
    def new_fit(self, X_df, y_df):
        return fit(self, X_df.values, y_df.values)
    return new_fit
score_func_selectors = {
    'GenericUnivariateSelect': fs.GenericUnivariateSelect,  # Univariate feature selector with configurable strategy.
    'SelectFdr': fs.SelectFdr,  # Filter: Select the p-values for an estimated false discovery rate.
    'SelectFpr': fs.SelectFpr,  # Filter: Select the p-values below alpha based on an FPR test.
    'SelectFwe': fs.SelectFwe,  # Filter: Select the p-values corresponding to the family-wise error rate.
    'SelectKBest': fs.SelectKBest,  # Select features according to the k highest scores.
    'SelectPercentile': fs.SelectPercentile,  # Select features according to a percentile of the highest scores.
}

model_selectors = {  # feature selectors which take a model instance as first parameter
    'RFE': fs.RFE,  # Feature ranking with recursive feature elimination.
    'RFECV': fs.RFECV,  # Feature ranking with recursive feature elimination and cross-validated selection of the best number of features.
    'SelectFromModel': fs.SelectFromModel,  # Meta-transformer for selecting features based on importance weights.
}

other_selectors = {
    'VarianceThreshold': fs.VarianceThreshold,  # Feature selector that removes all low-variance features.
}

# Union together the above dicts for the primary export:
name_to_constructor = dict(**score_func_selectors, **model_selectors, **other_selectors)

# Modify all sklearn transform methods to return dataframes:
for constructor in name_to_constructor.values():
    constructor.old_transform = constructor.transform
    constructor.transform = dataframify_selector(constructor.transform)
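# Example of the effect of the patch above: a fitted sklearn selector's transform
# now returns a dataframe with the original column names preserved. (Illustrative
# sketch; the toy data below is hypothetical.)
#
#   import numpy as np
#   import pandas as pd
#   import sklearn.feature_selection as fs
#   X = pd.DataFrame(np.random.rand(20, 3), columns=['a', 'b', 'c'])
#   y = X['a'] + 0.01 * np.random.rand(20)
#   selector = fs.SelectKBest(score_func=fs.f_regression, k=2)
#   selector.fit(X, y)
#   print(selector.transform(X).columns.tolist())  # two of ['a', 'b', 'c'], names intact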
class EnsembleModelFeatureSelector(object):
    """
    Class custom-written for MAST-ML to conduct selection of features with ensemble model feature importances

    Args:
        estimator: (scikit-learn model/estimator object), a scikit-learn model/estimator

        k_features: (int), the number of features to select

    Methods:
        fit: performs feature selection
            Args:
                X: (dataframe), dataframe of X features
                y: (dataframe), dataframe of y data
            Returns:
                self

        transform: performs the transform to generate output of only selected features
            Args:
                X: (dataframe), dataframe of X features
            Returns:
                dataframe: (dataframe), dataframe of selected X features
    """
    def __init__(self, estimator, k_features):
        self.estimator = estimator
        self.k_features = k_features
        # Check that a correct model was passed in
        self._check_model()
        self.selected_features = list()

    def _check_model(self):
        if self.estimator.__class__.__name__ not in ['RandomForestRegressor', 'ExtraTreesRegressor', 'GradientBoostingRegressor']:
            raise ValueError('Models used in EnsembleModelFeatureSelector must be one of RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor')
        return
    def fit(self, X, y=None):
        feature_importances = self.estimator.fit(X, y).feature_importances_
        feature_importance_dict = dict()
        for col, f in zip(X.columns.tolist(), feature_importances):
            feature_importance_dict[col] = f
        # Sort (importance, feature name) pairs so the most important features come first
        feature_importances_sorted = sorted(((f, col) for col, f in feature_importance_dict.items()), reverse=True)
        sorted_features_list = [f[1] for f in feature_importances_sorted]
        self.selected_features = sorted_features_list[0:self.k_features]
        return self
    def transform(self, X):
        df = X[self.selected_features]
        return df
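# Example usage of EnsembleModelFeatureSelector (illustrative sketch; the toy data
# below is hypothetical):
#
#   import numpy as np
#   import pandas as pd
#   from sklearn.ensemble import RandomForestRegressor
#   X = pd.DataFrame(np.random.rand(50, 4), columns=['f0', 'f1', 'f2', 'f3'])
#   y = X['f0'] + 2 * X['f1']
#   selector = EnsembleModelFeatureSelector(estimator=RandomForestRegressor(), k_features=2)
#   selector.fit(X, y)
#   X_selected = selector.transform(X)  # dataframe with the 2 highest-importance features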
class PearsonSelector(object):
    """
    Class custom-written for MAST-ML to conduct selection of features based on the Pearson correlation coefficient
    between features and target. Can also be used for dimensionality reduction by removing redundant features that are
    highly correlated with each other.

    Args:
        threshold_between_features: (float), the threshold to decide whether redundant features are removed. Should be
        a decimal value between 0 and 1. Only used if remove_highly_correlated_features is True.

        threshold_with_target: (float), the threshold to decide whether a given feature is sufficiently correlated with
        the target feature and thus kept as a selected feature. Should be a decimal value between 0 and 1.

        remove_highly_correlated_features: (bool), whether to remove features highly correlated with each other

        k_features: (int), the number of features to select

    Methods:
        fit: performs feature selection
            Args:
                X: (dataframe), dataframe of X features
                savepath: (str), path to save the correlation spreadsheets to
                y: (dataframe), dataframe of y data
            Returns:
                self

        transform: performs the transform to generate output of only selected features
            Args:
                X: (dataframe), dataframe of X features
            Returns:
                dataframe: (dataframe), dataframe of selected X features
    """
    def __init__(self, threshold_between_features, threshold_with_target, remove_highly_correlated_features, k_features):
        self.threshold_between_features = threshold_between_features
        self.threshold_with_target = threshold_with_target
        self.remove_highly_correlated_features = remove_highly_correlated_features
        self.k_features = k_features
        self.selected_features = list()
    def fit(self, X, savepath, y=None, Xgroups=None):
        df = X
        df_features = df.columns.tolist()
        n_col = df.shape[1]
        if self.remove_highly_correlated_features == True:
            # Build the full feature-feature Pearson correlation matrix
            array_data = list()
            for i in range(n_col):
                col_data = df.iloc[:, i]
                col = list()
                for j in range(n_col):
                    row_data = df.iloc[:, j]
                    corr, _ = pearsonr(row_data, col_data)  # Pearson correlation
                    col.append(corr)
                array_data.append(col)
            array_df = pd.DataFrame(array_data, index=df_features, columns=df_features)
            array_df.to_excel(os.path.join(savepath, 'Full_correlation_matrix.xlsx'))

            #### Print features highly-correlated to each other into excel
            hcorr = dict()
            highly_correlated_features = list()
            for i in range(n_col):
                feature1 = array_df.columns[i]
                for j in range(n_col):
                    feature2 = array_df.columns[j]
                    # Index with i, j so the matrix entry aligns with feature1/feature2
                    if abs(array_df.iloc[i, j]) >= np.float64(self.threshold_between_features):
                        if i != j:  # Ignore diagonal entries
                            if not (feature2, feature1) in hcorr:  # Ignore mirrored duplicates of the same pair
                                hcorr[(feature1, feature2)] = array_df.iloc[i, j]
                                highly_correlated_features.append(feature2)
            hcorr_df = pd.DataFrame(hcorr, index=["Corr"])
            hcorr_df.to_excel(os.path.join(savepath, 'Highly_correlated_features.xlsx'))
            highly_correlated_features = list(np.unique(np.array(highly_correlated_features)))

            #### Print the removed features and the new smaller dataframe with features removed
            # Deep copy the original dataframe
            removed_features_df = copy.deepcopy(X)
            new_df = copy.deepcopy(X)
            # Drop the features that can be removed
            all_features = list(new_df.columns)
            removed_features = list()
            for feature in all_features:
                if feature not in highly_correlated_features:
                    removed_features.append(feature)
                    new_df = new_df.drop(columns=feature)
            # Print the highly correlated features that were removed
            for feature in all_features:
                if feature not in removed_features:
                    removed_features_df = removed_features_df.drop(columns=feature)
            removed_features_df.to_excel(os.path.join(savepath, "Highly_correlated_features_removed.xlsx"), index=False)
            # Define self.selected_features
            remaining_features = list(new_df.columns)
        else:
            remaining_features = list(df.columns)

        # Compute Pearson correlations between each remaining feature and the target
        all_corrs = {}
        for feature_name in remaining_features:
            # Index by name so this also works when features were removed above
            feature_data = df[feature_name]
            corr, _ = pearsonr(y, feature_data)
            all_corrs[feature_name] = corr
        all_corrs = abs(pd.Series(all_corrs))
        self.selected_features = list(all_corrs[all_corrs > self.threshold_with_target].sort_values(ascending=False).keys())
        # Sometimes the specified threshold is too high; lower it until at least the
        # requested number of features is selected
        while len(self.selected_features) < self.k_features:
            log.debug('WARNING: Pearson selector threshold was too high to result in selecting any features, lowering threshold to get specified feature number')
            self.threshold_with_target -= 0.05
            self.selected_features = list(all_corrs[all_corrs > self.threshold_with_target].sort_values(ascending=False).keys())
            if len(self.selected_features) == n_col:
                log.debug('WARNING: Pearson selector reduced the threshold such that all features were included')
                break
            log.debug('Pearson selector selected features with an adjusted threshold value')
        if len(self.selected_features) > self.k_features:
            self.selected_features = list(all_corrs[all_corrs > self.threshold_with_target].sort_values(ascending=False).keys())[:self.k_features]

        # Create a Pandas Excel writer using XlsxWriter as the engine.
        writer = pd.ExcelWriter(os.path.join(savepath, 'Features_highly_correlated_with_target.xlsx'), engine='xlsxwriter')
        # Create the dataframe displaying the highly correlated features and the Pearson correlations
        hcorr_with_target_df = pd.DataFrame(all_corrs,
                                            index=list(all_corrs[all_corrs > self.threshold_with_target].sort_values(ascending=False).keys()),
                                            columns=["Pearson Correlation (absolute value)"])
        hcorr_with_target_df.to_excel(writer, sheet_name='Sheet1', index=True)
        hcorr_with_target_features = list(hcorr_with_target_df.index)
        # Create dataframe containing only the highly correlated features
        all_features = list(df.columns)
        for feature in all_features:
            if not feature in hcorr_with_target_features:
                df = df.drop(columns=feature)
        # Reorder the columns by their correlation to the target feature;
        # from left to right, the strength of correlation decreases.
        df = df.reindex(columns=hcorr_with_target_features)
        # Print the dataframe to a spreadsheet
        df.to_excel(writer, sheet_name='Sheet2', index=False)
        # Close the Pandas Excel writer and output the Excel file.
        writer.save()  # note: newer pandas versions use writer.close() instead
        return self
    def transform(self, X):
        dataframe = X[self.selected_features]
        return dataframe
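# Example usage of PearsonSelector (illustrative sketch; the toy data and savepath
# are hypothetical; fit writes its correlation spreadsheets into savepath):
#
#   import numpy as np
#   import pandas as pd
#   X = pd.DataFrame(np.random.rand(50, 4), columns=['f0', 'f1', 'f2', 'f3'])
#   y = X['f0'] + 0.1 * np.random.rand(50)
#   selector = PearsonSelector(threshold_between_features=0.9, threshold_with_target=0.5,
#                              remove_highly_correlated_features=False, k_features=2)
#   selector.fit(X, savepath='/tmp', y=y)
#   X_selected = selector.transform(X)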
class MASTMLFeatureSelector(object):
    """
    Class custom-written for MAST-ML to conduct forward selection of features with flexible model and cv scheme

    Args:
        estimator: (scikit-learn model/estimator object), a scikit-learn model/estimator

        n_features_to_select: (int), the number of features to select

        cv: (scikit-learn cross-validation object), a scikit-learn cross-validation object

        manually_selected_features: (list), a list of features manually set by the user. The feature selector will first
        start from this list of features and sequentially add features until n_features_to_select is met.

    Methods:
        fit: performs feature selection
            Args:
                X: (dataframe), dataframe of X features
                y: (dataframe), dataframe of y data
                savepath: (str), path to save per-iteration selection data to
                Xgroups: (dataframe), dataframe of group labels
            Returns:
                self

        transform: performs the transform to generate output of only selected features
            Args:
                X: (dataframe), dataframe of X features
            Returns:
                dataframe: (dataframe), dataframe of selected X features
    """
    def __init__(self, estimator, n_features_to_select, cv, manually_selected_features=list()):
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.cv = cv
        # Copy the list so appending selected features never mutates the caller's
        # list (or the shared mutable default argument)
        self.manually_selected_features = list(manually_selected_features)
        self.selected_feature_names = list(self.manually_selected_features)
    def fit(self, X, y, savepath, Xgroups=None):
        if Xgroups is None or Xgroups.shape[0] == 0:
            xgroups = np.zeros(len(y))
            Xgroups = pd.DataFrame(xgroups)

        selected_feature_avg_rmses = list()
        selected_feature_std_rmses = list()
        basic_forward_selection_dict = dict()
        num_features_selected = 0
        x_features = X.columns.tolist()
        if self.n_features_to_select >= len(x_features):
            self.n_features_to_select = len(x_features)
        while num_features_selected < self.n_features_to_select:
            log.info('On number of features selected: ' + str(num_features_selected))
            # Catch pandas warnings here
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                ranked_features = self._rank_features(X=X, y=y, groups=Xgroups)
                top_feature_name, top_feature_avg_rmse, top_feature_std_rmse = self._choose_top_feature(ranked_features=ranked_features)

            self.selected_feature_names.append(top_feature_name)
            if len(self.selected_feature_names) > 0:
                log.info('selected features')
                log.info(self.selected_feature_names)
            selected_feature_avg_rmses.append(top_feature_avg_rmse)
            selected_feature_std_rmses.append(top_feature_std_rmse)

            basic_forward_selection_dict[str(num_features_selected)] = dict()
            basic_forward_selection_dict[str(num_features_selected)]['Number of features selected'] = num_features_selected + 1
            basic_forward_selection_dict[str(num_features_selected)]['Top feature added this iteration'] = top_feature_name
            basic_forward_selection_dict[str(num_features_selected)]['Avg RMSE using top features'] = top_feature_avg_rmse
            basic_forward_selection_dict[str(num_features_selected)]['Stdev RMSE using top features'] = top_feature_std_rmse
            # Save for every loop of selecting features
            pd.DataFrame(basic_forward_selection_dict).to_csv(os.path.join(savepath, 'MASTMLFeatureSelector_data_feature_' + str(num_features_selected) + '.csv'))
            num_features_selected += 1
        basic_forward_selection_dict[str(self.n_features_to_select - 1)]['Full feature set Names'] = self.selected_feature_names
        basic_forward_selection_dict[str(self.n_features_to_select - 1)]['Full feature set Avg RMSEs'] = selected_feature_avg_rmses
        basic_forward_selection_dict[str(self.n_features_to_select - 1)]['Full feature set Stdev RMSEs'] = selected_feature_std_rmses
        #self._plot_featureselected_learningcurve(selected_feature_avg_rmses=selected_feature_avg_rmses,
        #                                         selected_feature_std_rmses=selected_feature_std_rmses)
        return self
    def transform(self, X):
        dataframe = self._get_featureselected_dataframe(X=X, selected_feature_names=self.selected_feature_names)
        return dataframe
    def _rank_features(self, X, y, groups):
        y = np.array(y).reshape(-1, 1)
        ranked_features = dict()
        if groups is not None:
            groups = groups.iloc[:, 0].tolist()
        for col in X.columns:
            if col not in self.selected_feature_names:
                # Reset per candidate feature so each average only reflects its own CV splits
                tests_metrics = list()
                X_ = X.loc[:, self.selected_feature_names]
                X__ = X.loc[:, col]
                X_ = np.array(pd.concat([X_, X__], axis=1))
                for trains, tests in self.cv.split(X_, y, groups):
                    self.estimator.fit(X_[trains], y[trains])
                    predict_tests = self.estimator.predict(X_[tests])
                    tests_metrics.append(root_mean_squared_error(y[tests], predict_tests))
                avg_rmse = np.mean(tests_metrics)
                std_rmse = np.std(tests_metrics)
                ranked_features[col] = {"avg_rmse": avg_rmse, "std_rmse": std_rmse}
        return ranked_features

    def _choose_top_feature(self, ranked_features):
        feature_names = list()
        feature_avg_rmses = list()
        feature_std_rmses = list()
        feature_names_sorted = list()
        feature_std_rmses_sorted = list()
        # Make dict of ranked features into lists for sorting
        for k, v in ranked_features.items():
            feature_names.append(k)
            for kk, vv in v.items():
                if kk == 'avg_rmse':
                    feature_avg_rmses.append(vv)
                if kk == 'std_rmse':
                    feature_std_rmses.append(vv)
        # Sort feature lists so RMSE goes from min to max
        feature_avg_rmses_sorted = sorted(feature_avg_rmses)
        for feature_avg_rmse in feature_avg_rmses_sorted:
            for k, v in ranked_features.items():
                if v['avg_rmse'] == feature_avg_rmse:
                    feature_names_sorted.append(k)
                    feature_std_rmses_sorted.append(v['std_rmse'])
        top_feature_name = feature_names_sorted[0]
        top_feature_avg_rmse = feature_avg_rmses_sorted[0]
        top_feature_std_rmse = feature_std_rmses_sorted[0]
        return top_feature_name, top_feature_avg_rmse, top_feature_std_rmse

    def _get_featureselected_dataframe(self, X, selected_feature_names):
        # Return dataframe containing only selected features
        X_selected = X.loc[:, selected_feature_names]
        return X_selected
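# Example usage of MASTMLFeatureSelector (illustrative sketch; the toy data, cv
# scheme, and savepath are hypothetical; fit writes one CSV per iteration into savepath):
#
#   import numpy as np
#   import pandas as pd
#   from sklearn.linear_model import LinearRegression
#   from sklearn.model_selection import KFold
#   X = pd.DataFrame(np.random.rand(40, 5), columns=['f0', 'f1', 'f2', 'f3', 'f4'])
#   y = X['f0'] - X['f3']
#   selector = MASTMLFeatureSelector(estimator=LinearRegression(),
#                                    n_features_to_select=2, cv=KFold(n_splits=5))
#   selector.fit(X, y, savepath='/tmp')
#   X_selected = selector.transform(X)  # dataframe with the 2 forward-selected features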
# Include Principal Component Analysis
PCA.transform = dataframify_new_column_names(PCA.transform, 'pca_')

# Include Sequential Forward Selector
SequentialFeatureSelector.transform = dataframify_new_column_names(SequentialFeatureSelector.transform, 'sfs_')
SequentialFeatureSelector.fit = fitify_just_use_values(SequentialFeatureSelector.fit)
model_selectors['SequentialFeatureSelector'] = SequentialFeatureSelector
name_to_constructor['SequentialFeatureSelector'] = SequentialFeatureSelector

# Custom selectors don't need to be dataframified
name_to_constructor.update({
    #'PassThrough': PassThrough,
    'DoNothing': util_legos.DoNothing,
    'PCA': PCA,
    'SequentialFeatureSelector': SequentialFeatureSelector,
    'MASTMLFeatureSelector': MASTMLFeatureSelector,
    'PearsonSelector': PearsonSelector,
    'EnsembleModelFeatureSelector': EnsembleModelFeatureSelector,
})
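# Example lookup through the name_to_constructor registry (illustrative sketch;
# the toy data is hypothetical):
#
#   import numpy as np
#   import pandas as pd
#   X = pd.DataFrame(np.random.rand(20, 4), columns=['a', 'b', 'c', 'd'])
#   pca = name_to_constructor['PCA'](n_components=2)
#   pca.fit(X)
#   print(pca.transform(X).columns.tolist())  # ['pca_0', 'pca_1'], named by dataframify_new_column_names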