Source code for mastml.data_loader

"""
The data_loader module is used for importing data from a user-specified csv or xlsx file into MAST-ML
"""

import pandas as pd
import logging
from mastml import utils
# Logger named 'mastml'; load_data reports its blacklist handling through it
log = logging.getLogger('mastml')

def load_data(file_path, input_features=None, input_target=None, input_grouping=None, feature_blacklist=None):
    """
    Method that accepts the filepath of an input data file and returns a full dataframe and parsed X and y dataframes

    Args:
        file_path: (str), path to data file. It is read as csv first; if that fails it is read as an Excel file.
        input_features: (str or list of str), column name(s) to be used as input features (X data). If None, all
            columns other than the target column are used (blacklisted columns are then removed from X).
        input_target: (str), column name for data to be fit to (y data). If None, the last column of the data file
            that is not listed in input_features is used (the very last column when input_features is also None).
        input_grouping: (str or list of str), column name(s) used to group data in user-defined grouping scheme
        feature_blacklist: (list of str), column names that must not be used as input features. Defaults to no
            blacklisted columns.

    Returns:
        df: (dataframe), full dataframe of the input data with the target (y) column removed
        X: (dataframe), dataframe containing only the X data from the data file
        X_noinput: (dataframe), dataframe containing the blacklisted columns of the data file, i.e. the columns
            that are not used as input features
        X_grouped: (dataframe), dataframe containing the columns of the data file that correspond to a data
            grouping scheme, or None if input_grouping is not given
        y: the y data selected from the data file by the input_target column label

    Raises:
        Exception: if an input feature or the target column is not present in the data file, or if no column is
            left over to serve as the default target
        KeyError: if a blacklisted column is not present in the data file
        utils.ConfError: if a column ends up in both X and X_noinput
    """

    # Load data: csv first, then the Excel readers as fallbacks. Catch Exception rather than using a bare
    # except so that KeyboardInterrupt / SystemExit are not swallowed while probing file formats.
    try:
        df = pd.read_csv(file_path)
    except Exception:
        try:
            df = pd.read_excel(file_path)
        except Exception:
            df = pd.read_excel(file_path, engine='openpyxl')

    if feature_blacklist is None:
        feature_blacklist = []

    # Normalize input_features BEFORE it is used for membership tests; testing a column name against a bare
    # string would be a substring check rather than a column-name comparison.
    if isinstance(input_features, str):
        input_features = [input_features]
    elif input_features is not None:
        input_features = list(input_features)

    # Assign default values to input_target and input_features
    if input_target is None:
        if input_features is None:
            # input is first n-1 columns and target is just column n
            input_target = df.columns[-1]
        else:
            # target is the last non-input column
            leftover_columns = [col for col in df.columns if col not in input_features]
            if not leftover_columns:
                raise Exception("Data file does not have a column left over to use as the target; every column "
                                "is listed in input_features")
            input_target = leftover_columns[-1]
    if input_features is None:
        # input is all the columns except the target column
        input_features = [col for col in df.columns if col != input_target]

    # Ensure all required columns are present
    for feature in input_features + [input_target]:
        if feature not in df.columns:
            raise Exception(f"Data file does not have column '{feature}'")

    X, y = df[input_features], df[input_target]

    log.info('blacklisted features, either from "input_other" or a "input_grouping":' + str(feature_blacklist))

    # Take blacklisted features out of X and collect them in X_noinput. dict.fromkeys de-duplicates while
    # keeping the caller's order, so the column order of X_noinput is deterministic.
    X_noinput_dict = dict()
    for feature in dict.fromkeys(feature_blacklist):
        if feature in X.columns:
            X = X.drop(feature, axis=1)
        else:
            # Happens when input_features was given explicitly and does not include the blacklisted column
            log.info('Blacklisted feature ' + str(feature) + ' already not present in dataframe')
        X_noinput_dict[feature] = df[feature]
    X_noinput = pd.DataFrame(X_noinput_dict)

    if input_grouping:
        X_grouped = pd.DataFrame(df[input_grouping])
    else:
        X_grouped = None

    df = df.drop(input_target, axis=1)

    # Check that features are unambiguously selected (defensive: blacklisted columns were dropped from X above)
    for feature in X_noinput.columns:
        if feature in X.columns:
            raise utils.ConfError('An error has occurred where the same feature in both the "input_features" and '
                                  '"input_other" fields. Please correct your input file and re-run MAST-ML')

    return df, X, X_noinput, X_grouped, y