Source code for mastml.conf_parser

"""
The conf_parser module is used for handling, parsing, and checking MAST-ML input configuration files
"""

from sklearn.metrics import make_scorer
from configobj import ConfigObj
import logging

from mastml import metrics, utils
from mastml.legos.model_finder import check_models_mixed
from mastml.legos import feature_selectors, model_finder

log = logging.getLogger('mastml')

def parse_conf_file(filepath, from_dict=False):
    """
    Method that accepts the filepath of an input configuration file and returns its parsed dictionary

    Args:
        filepath: (str), path to config file, or a dict of config values directly

    Returns:
        conf: (dict), dictionary parsed from config file
    """
    if not from_dict:
        conf = ConfigObj(filepath)
    else:
        conf = filepath  # The "filepath" in this case is already a dictionary of values used as the config file

    main_sections = ['GeneralSetup', 'DataSplits', 'Models', 'LearningCurve', 'DataCleaning', 'HyperOpt', 'ModelHosting']
    feature_sections = ['FeatureGeneration', 'Clustering', 'FeatureNormalization', 'FeatureSelection']
    feature_section_dicts = [conf[name] for name in feature_sections if name in conf]

    def set_required_sections_to_empty():
        for name in main_sections:
            if name not in conf:
                conf[name] = dict()
    set_required_sections_to_empty()

    def check_unknown_sections():
        all_sections = main_sections + feature_sections + ['MiscSettings']
        for section_name in conf:
            if section_name not in all_sections:
                raise Exception(f'[{section_name}] is not a valid section!'
                                f' Valid sections: {all_sections}')
    check_unknown_sections()

    def verify_subsection_only_sections():
        for dictionary in [conf, conf['DataSplits'], conf['Models']] + feature_section_dicts:
            for name, value in dictionary.items():
                if not isinstance(value, dict):
                    raise utils.InvalidConfSubSection(
                        f"Parameter in subsection-only section: {name}={value}")
    verify_subsection_only_sections()

    def parameter_dict_type_check_and_cast():
        parameter_dicts = list()
        parameter_dicts.extend(conf['Models'].values())
        parameter_dicts.extend(conf['DataSplits'].values())
        for feature_section in feature_section_dicts:
            parameter_dicts.extend(feature_section.values())
        for parameter_dict in parameter_dicts:
            for name, value in parameter_dict.items():
                # Skip any nested subsection inside a parameter-only section
                if isinstance(value, dict):
                    continue
                # Cast the strings to their respective types
                parameter_dict[name] = fix_types(value)
    parameter_dict_type_check_and_cast()

    # Ensure all models are either classifiers or regressors (raises an error if mixed):
    is_classification = conf['is_classification'] = check_models_mixed(
        key.split('_')[0] for key in conf['Models'])

    # Add the empty splitter if no splitters are specified:
    if conf['DataSplits'] == dict():
        conf['DataSplits']['NoSplit'] = dict()

    def set_unspecified_sections_to_empty_dict():
        for name in feature_sections:
            if name not in conf or conf[name] == dict():
                if name == 'Clustering':
                    conf[name] = dict()
                else:
                    conf[name] = {'DoNothing': dict()}
    set_unspecified_sections_to_empty_dict()

    GS = conf['GeneralSetup']

    def check_general_setup_settings_are_valid():
        all_settings = ['input_features', 'input_target', 'metrics', 'randomizer',
                        'input_testdata', 'input_other', 'input_grouping']
        for name in GS:
            if name not in all_settings:
                raise utils.InvalidConfParameters(
                    f"[GeneralSetup] contains unknown setting {name}.\n"
                    f"Valid GeneralSetup options are: {all_settings}")
    check_general_setup_settings_are_valid()

    if 'input_grouping' not in GS:
        GS['input_grouping'] = None

    # Find grouping features to blacklist out of X (see data loader)
    def collect_grouping_features():
        for section in conf:
            if not isinstance(conf[section], dict):
                continue
            for subsection in conf[section]:
                SS = conf[section][subsection]
                if not isinstance(SS, dict):
                    continue
                if 'input_grouping' in SS.keys():
                    log.debug('found input_grouping feature: ' + SS['input_grouping'])
                    yield SS['input_grouping']

    # Known issue: if clusters are automatically generated, the new column is not in the initial df even though it is
    # listed as a grouping feature. For now, grouping feature names must be listed under input_other by hand.
    #feature_blacklist = list(collect_grouping_features())
    feature_blacklist = list()

    # Default input_other to a list
    if 'input_other' not in GS:
        GS['input_other'] = list()
    elif isinstance(GS['input_other'], str):
        GS['input_other'] = [GS['input_other']]

    # Add the discovered grouping features to the list
    GS['input_other'] += feature_blacklist

    def set_randomizer_setting():
        if 'randomizer' in GS:
            GS['randomizer'] = mybool(GS['randomizer'])
        else:
            GS['randomizer'] = False
    set_randomizer_setting()

    def set_default_features():
        for name in ['input_features', 'input_target']:
            if (name not in GS) or (GS[name] == 'Auto'):
                GS[name] = None
    set_default_features()

    def set_default_metrics():
        if 'metrics' not in GS or GS['metrics'] == 'Auto':
            if is_classification:
                GS['metrics'] = ['accuracy', 'precision_binary', 'recall_binary', 'f1_binary']
            else:
                GS['metrics'] = ['R2', 'root_mean_squared_error', 'mean_absolute_error', 'rmse_over_stdev']
    set_default_metrics()

    # Turn names of metrics into actual metrics.
    # If only one metric was given as a string, wrap it in a list for parsing:
    if isinstance(GS['metrics'], str):
        GS['metrics'] = [GS['metrics']]
    GS['metrics'] = metrics.check_and_fetch_names(GS['metrics'], is_classification)

    def change_score_func_strings_into_actual_score_funcs():
        for selector_name, args_dict in conf['FeatureSelection'].items():
            class_name = selector_name.split('_')[0]
            if class_name not in feature_selectors.score_func_selectors:
                continue
            name_to_func = (metrics.classification_score_funcs if is_classification
                            else metrics.regression_score_funcs)
            if 'score_func' in args_dict:
                try:
                    args_dict['score_func'] = name_to_func[args_dict['score_func']]
                except KeyError:
                    task = 'classification' if is_classification else 'regression'
                    raise utils.InvalidValue(
                        f"Score function '{args_dict['score_func']}' not valid for {task}"
                        f" tasks (inside feature selector {selector_name}). Valid score"
                        f" functions: {list(name_to_func.keys())}")
            else:
                # Default to f_classif or f_regression
                args_dict['score_func'] = \
                    name_to_func['f_classif' if is_classification else 'f_regression']
    change_score_func_strings_into_actual_score_funcs()

    def make_long_name_short_name_pairs():
        dictionaries = ([conf['DataSplits'], conf['Models'], conf['HyperOpt']]
                        + [conf[name] for name in feature_sections])
        for dictionary in dictionaries:
            for name, settings in dictionary.items():
                dictionary[name] = [name.split('_')[0], settings]
    make_long_name_short_name_pairs()

    def check_and_boolify_plot_settings():
        default_false = ['plot_each_feature_vs_target', 'rf_error_method', 'rf_error_percentile',
                         'normalize_target_feature']
        default_true = ['plot_target_histogram', 'plot_train_test_plots', 'plot_predicted_vs_true',
                        'plot_error_plots', 'plot_predicted_vs_true_average', 'plot_best_worst_per_point']
        all_settings = default_false + default_true
        if 'MiscSettings' not in conf:
            conf['MiscSettings'] = dict()
        MS = conf['MiscSettings']
        for name, value in MS.items():
            if name not in all_settings:
                raise utils.InvalidConfParameters(f"[MiscSettings] parameter '{name}' is unknown")
            try:
                MS[name] = mybool(value)
            except ValueError:
                pass
        for name in default_false:
            if name not in MS:
                MS[name] = False
        for name in default_true:
            if name not in MS:
                MS[name] = True
    check_and_boolify_plot_settings()

    # TODO: remove?
    def check_learning_curve_settings():
        if 'learning_curve_model' not in GS:
            raise utils.InvalidConfParameters("You enabled data_learning_curve plots but you did"
                                              " not specify learning_curve_model in [GeneralSetup]")
        if 'learning_curve_score' not in GS:
            raise utils.InvalidConfParameters("You enabled data_learning_curve plots but you did"
                                              " not specify learning_curve_score in [GeneralSetup]")

    # Turn the learning curve scoring string into a scorer object:
    if conf['LearningCurve']:
        score_name = conf['LearningCurve']['scoring']
        d = metrics.check_and_fetch_names([score_name], is_classification)
        greater_is_better, score_func = d[score_name]
        conf['LearningCurve']['scoring'] = make_scorer(score_func, greater_is_better=True)

    return conf
def fix_types(maybe_list):
    """
    Method that returns the true datatype of values passed as a string or list of strings, parsed from the configuration file

    Args:
        maybe_list: (list, str), a list of strings or just a string whose datatype should be e.g. int or list of float

    Returns:
        maybe_list: (list, bool, int, float, str), a list of items or other data type converted from string to the correct data type
    """
    if isinstance(maybe_list, list):
        return [fix_types(item) for item in maybe_list]

    try:
        return mybool(maybe_list)
    except ValueError:
        pass
    try:
        return int(maybe_list)
    except ValueError:
        pass
    try:
        return float(maybe_list)
    except ValueError:
        pass
    return str(maybe_list)
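# Example of the casting behavior implemented above (lists are converted element-wise;
# anything that is not a bool, int, or float string falls through to str):
#
#   >>> fix_types('True')
#   True
#   >>> fix_types('3')
#   3
#   >>> fix_types(['1', '2.5', 'spam'])
#   [1, 2.5, 'spam']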
def mybool(string):
    """
    Method that converts a string equal to 'True' or 'False' into type bool

    Args:
        string: (str), a string as 'True' or 'False'

    Returns:
        bool: (bool), True or False
    """
    if string.lower() == 'true':
        return True
    if string.lower() == 'false':
        return False
    raise ValueError
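# Example: mybool accepts only the two boolean strings (case-insensitive); anything
# else raises ValueError, which fix_types uses to fall through to the numeric casts:
#
#   >>> mybool('false')
#   False
#   >>> mybool('TRUE')
#   True
#   >>> mybool('yes')   # raises ValueError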