Source code for mastml.datasets

"""
This module provides various methods for importing data into MAST-ML.

SklearnDatasets:
    Enables easy import of model datasets from scikit-learn, such as the Boston housing data, Friedman datasets, etc.

LocalDatasets:
    Main method for importing datasets that are stored in an accessible path. Supported file formats are Excel
    spreadsheets (.xlsx), .csv files, and pickled dataframes (.pickle). This method also makes it easy to
    separately denote other data features that are not directly the X or y data, such as features used for
    grouping, extra features not used in fitting, or features that denote manually held-out test data.

FigshareDatasets:
    Method to download data that is stored on Figshare, an open-source data hosting service. This class
    can be used to download data, then subsequently the LocalDatasets class can be used to import the data.

FoundryDatasets:
    Method to download data that is stored on the Materials Data Facility (MDF) Foundry data hosting service.
    This class can be used to download data, then subsequently the LocalDatasets class can be used to import
    the data.

MatminerDatasets:
    Method to download data that is stored as part of the matminer machine learning package
    (https://github.com/hackingmaterials/matminer). This class can be used to download data, then
    subsequently the LocalDatasets class can be used to import the data.

"""

import pandas as pd
import os
import numpy as np
from pprint import pprint
import shutil
import pickle

import sklearn.datasets
from mdf_forge import Forge

from matminer.datasets.dataset_retrieval import load_dataset, get_available_datasets

try:
    from figshare.figshare.figshare import Figshare
except ImportError:
    print('Figshare is an optional dependency. To import data from Figshare, manually install figshare via '
          'git clone of https://github.com/cognoma/figshare.git')


class SklearnDatasets():
    """
    Class wrapping the sklearn.datasets functionality for easy import of toy datasets from sklearn. Some changes
    were added to make all datasets operate more consistently, e.g. the boston housing data

    Args:
        return_X_y: (bool), whether to return X, y data as an (X, y) tuple (should be True for easiest use in MAST-ML)

        as_frame: (bool), whether to return X, y data as pandas dataframe objects

        n_class: (int), number of classes (only applies to the load_digits method)

    Methods:
        load_boston: Loads the Boston housing data (regression)

        load_iris: Loads the flower iris data (classification)

        load_diabetes: Loads the diabetes data set (regression)

        load_digits: Loads the MNIST digits data set (classification)

        load_linnerud: Loads the linnerud data set (regression)

        load_wine: Loads the wine data set (classification)

        load_breast_cancer: Loads the breast cancer data set (classification)

        load_friedman: Loads the Friedman data set (regression)
    """

    def __init__(self, return_X_y=True, as_frame=False):
        self.return_X_y = return_X_y
        self.as_frame = as_frame
    def load_boston(self):
        if self.as_frame:
            boston = sklearn.datasets.load_boston()
            X = pd.DataFrame(boston.data, columns=boston.feature_names)
            y = pd.DataFrame(boston.target, columns=['MEDV'])
            return X, y
        return sklearn.datasets.load_boston(return_X_y=self.return_X_y)

    def load_iris(self):
        return sklearn.datasets.load_iris(return_X_y=self.return_X_y, as_frame=self.as_frame)

    def load_diabetes(self):
        return sklearn.datasets.load_diabetes(return_X_y=self.return_X_y, as_frame=self.as_frame)

    def load_digits(self, n_class=10):
        return sklearn.datasets.load_digits(return_X_y=self.return_X_y, as_frame=self.as_frame, n_class=n_class)

    def load_linnerud(self):
        return sklearn.datasets.load_linnerud(return_X_y=self.return_X_y, as_frame=self.as_frame)

    def load_wine(self):
        return sklearn.datasets.load_wine(return_X_y=self.return_X_y, as_frame=self.as_frame)

    def load_breast_cancer(self):
        return sklearn.datasets.load_breast_cancer(return_X_y=self.return_X_y, as_frame=self.as_frame)

    def load_friedman(self, n_samples=100, n_features=10, noise=0.0):
        X, y = sklearn.datasets.make_friedman1(n_samples=n_samples, n_features=n_features, noise=noise)
        if self.as_frame:
            return pd.DataFrame(X, columns=["x" + str(i) for i in range(n_features)]), pd.DataFrame(y, columns=['target'])
        else:
            return X, y
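# Example usage (a minimal sketch): load a toy regression dataset as pandas dataframes for use in MAST-ML.
#
#   sklearn_data = SklearnDatasets(as_frame=True)
#   X, y = sklearn_data.load_diabetes()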
class LocalDatasets():
    """
    Class to handle import and organization of a dataset stored locally.

    Args:
        file_path: (str), path to the data file to import

        feature_names: (list), list of strings containing the X feature names

        target: (str), string denoting the y data (target) name

        extra_columns: (list), list of strings containing additional column names that are not features or target

        group_column: (str), string denoting the name of an input column to be used to group data

        testdata_columns: (list), list of strings containing column names denoting sets of left-out data. Entries
            should be marked with a 0 (not left out) or 1 (left out)

        average_duplicates: (bool), whether to average duplicate entries from the imported data

        average_duplicates_col: (str), string denoting the column name used to group duplicate entries for
            averaging. Needs to be specified if average_duplicates is True

        as_frame: (bool), whether to return data as pandas dataframes (otherwise will be numpy arrays)

    Methods:
        _import: imports the data. Should be in .csv, .xlsx or .pickle format
            Args:
                None
            Returns:
                df: (pd.DataFrame), pandas dataframe of the full dataset

        _get_features: Method to assess which columns belong to the target and feature_names
            Args:
                df: (pd.DataFrame), pandas dataframe of the full dataset
            Returns:
                None

        load_data: Method to import the data and ascertain which columns are features, target and extra based on
            provided input
            Args:
                copy: (bool), whether or not to copy the imported data to the designated savepath

                savepath: (str), path to save the data to (used if copy=True)
            Returns:
                data_dict: (dict), dictionary containing dataframes of X, y, groups, X_extra, X_testdata
    """

    def __init__(self, file_path, feature_names=None, target=None, extra_columns=None, group_column=None,
                 testdata_columns=None, average_duplicates=False, average_duplicates_col=None, as_frame=False):
        self.file_path = file_path
        self.feature_names = feature_names
        self.target = target
        self.extra_columns = extra_columns
        self.group_column = group_column
        self.testdata_columns = testdata_columns
        self.average_duplicates = average_duplicates
        self.average_duplicates_col = average_duplicates_col
        self.as_frame = as_frame
        if self.extra_columns is None:
            self.extra_columns = list()
        if self.testdata_columns is None:
            self.testdata_columns = list()

    def _import(self):
        fname, ext = os.path.splitext(self.file_path)
        if ext == '.csv':
            df = pd.read_csv(self.file_path)
        elif ext == '.xlsx':
            try:
                df = pd.read_excel(self.file_path)
            except Exception:
                df = pd.read_excel(self.file_path, engine='openpyxl')
        elif ext == '.pickle':
            with open(self.file_path, "rb") as input_file:
                data = pickle.load(input_file)
            df = pd.DataFrame(data)
        else:
            raise ValueError('file_path must be .csv, .xlsx or .pickle for local data import')
        return df

    def _get_features(self, df):
        if self.feature_names is None and self.target is None:
            print('WARNING: feature_names and target are not specified. Assuming the last column is the target '
                  'value and the remaining columns are features')
            self.target = df.columns[-1]
            self.feature_names = [col for col in df.columns if col != self.target and
                                  col not in self.extra_columns and col not in self.testdata_columns]
        elif self.feature_names is None:
            # Input features are all columns except the target, extra and test-data columns
            print('WARNING: feature_names not specified but target was specified. Assuming all columns except '
                  'target and extra columns are features')
            cols = [col for col in df.columns if col != self.target]
            self.feature_names = [col for col in cols if col not in self.extra_columns and
                                  col not in self.testdata_columns]
        elif self.target is None:
            # The target is the last column that is not an input feature
            for col in df.columns[::-1]:
                if col not in self.feature_names:
                    self.target = col
                    break
        return
    def load_data(self, copy=False, savepath=None):
        if copy == True:
            if savepath is not None:
                fname = os.path.normpath(self.file_path).split(os.path.sep)[-1]
                shutil.copy(self.file_path, os.path.join(savepath, fname))

        data_dict = dict()

        # Import data from file
        df = self._import()

        # Average duplicate entries, if specified
        if self.average_duplicates == True:
            if self.average_duplicates_col is not None:
                df = df.groupby(df[self.average_duplicates_col]).mean().reset_index()
            else:
                print('Error: you need to specify average_duplicates_col if average_duplicates is True')

        # Assign default values to input_features and target_feature
        self._get_features(df=df)

        X, y = df[self.feature_names], pd.DataFrame(df[self.target], columns=[self.target]).squeeze()
        data_dict['X'] = X
        data_dict['y'] = y

        if self.group_column:
            groups = df[self.group_column]
            data_dict['groups'] = groups
        else:
            data_dict['groups'] = None

        if self.extra_columns:
            X_extra = df[self.extra_columns]
            data_dict['X_extra'] = X_extra
        else:
            data_dict['X_extra'] = None

        if self.testdata_columns:
            X_testdata = list()
            for col in self.testdata_columns:
                X_testdata.append(np.array(df.loc[df[col] == 1].index).ravel())
            data_dict['X_testdata'] = X_testdata
        else:
            data_dict['X_testdata'] = None

        if self.as_frame == True:
            return data_dict
        else:
            for k, v in data_dict.items():
                data_dict[k] = np.array(v)
            return data_dict
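# Example usage (a minimal sketch; 'my_data.xlsx' and the column names here are hypothetical placeholders):
#
#   local_data = LocalDatasets(file_path='my_data.xlsx',
#                              target='property_value',
#                              extra_columns=['composition'],
#                              group_column='alloy_group',
#                              as_frame=True)
#   data_dict = local_data.load_data()
#   X, y = data_dict['X'], data_dict['y']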
class FigshareDatasets():
    """
    Class to download datasets hosted on Figshare. To install the figshare package:
    git clone https://github.com/cognoma/figshare.git

    Args:
        None

    Methods:
        download_data: downloads specified data from Figshare and saves to current directory
            Args:
                article_id: (int), the number denoting the Figshare article ID. Can be obtained from the URL to
                    the Figshare dataset

                savepath: (str), string denoting the savepath of the MAST-ML run
            Returns:
                None
    """

    def __init__(self):
        pass
    def download_data(self, article_id, savepath=None):
        fs = Figshare()
        fs.retrieve_files_from_article(article_id)
        if savepath:
            try:
                shutil.move(os.path.join(os.getcwd(), 'figshare_' + str(article_id)), savepath)
            except shutil.Error:
                print('Warning: could not move downloaded data to specified savepath, maybe because savepath is '
                      'the current working directory')
        return
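# Example usage (a minimal sketch; the article ID here is a hypothetical placeholder for a real Figshare
# article, and the optional figshare package must be installed):
#
#   figshare_data = FigshareDatasets()
#   figshare_data.download_data(article_id=1234567, savepath=os.getcwd())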
class FoundryDatasets():
    """
    Class to download datasets hosted on the Materials Data Facility (MDF)

    Args:
        no_local_server: (bool), whether or not the server is local. Set to True if running on e.g. Google Colab

        anonymous: (bool), whether to use your MDF user or be anonymous. Some functionality may be disabled if True

        test: (bool), whether to be in test mode. Some functionality may be disabled if True

    Methods:
        download_data: downloads specified data from MDF and saves to current directory
            Args:
                name: (str), name of the dataset to download

                doi: (str), digital object identifier of the dataset to download

                download: (bool), whether or not to download the full dataset
            Returns:
                None
    """

    def __init__(self, no_local_server, anonymous, test):
        self.no_local_server = no_local_server
        self.anonymous = anonymous
        self.test = test
        self.mdf = Forge(no_local_server=self.no_local_server,
                         anonymous=self.anonymous,
                         test=self.test)
    def download_data(self, name=None, doi=None, download=False):
        if name is not None:
            self.mdf.match_source_names(name)
        elif doi is not None:
            self.mdf.match_dois(doi)
        else:
            print('ERROR: please specify either the dataset name or DOI for lookup on MDF')
        result = self.mdf.search()
        if len(result) == 1:
            print('Successfully found the desired dataset on MDF')
            print('MDF entry:')
            pprint(result)
            if download == True:
                print('Downloading dataset from MDF')
                self.mdf.globus_download(results=result)
        return
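# Example usage (a minimal sketch; the dataset name here is a hypothetical placeholder for a real MDF
# source name or DOI):
#
#   foundry_data = FoundryDatasets(no_local_server=True, anonymous=True, test=False)
#   foundry_data.download_data(name='my_mdf_dataset', download=False)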
class MatminerDatasets():
    """
    Class to download datasets hosted on the Matminer package's Figshare page. A summary of available
    datasets can be found at: https://hackingmaterials.lbl.gov/matminer/dataset_summary.html

    Args:
        None

    Methods:
        download_data: downloads specified data from Matminer/Figshare and saves to current directory
            Args:
                name: (str), name of the dataset to download. For compatible names, call get_available_datasets

                save_data: (bool), whether to save the downloaded data to the current working directory
            Returns:
                df: (dataframe), dataframe of downloaded data

        get_available_datasets: returns information on the available dataset names and details one can download
            Args:
                None
            Returns:
                None
    """

    def __init__(self):
        pass
    def download_data(self, name, save_data=True):
        df = load_dataset(name=name)
        if save_data == True:
            df.to_excel(name + '.xlsx', index=False)
            with open('%s.pickle' % name, 'wb') as data_file:
                pickle.dump(df, data_file)
        return df
    def get_available_datasets(self):
        datasets = get_available_datasets()
        return
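# Example usage (a minimal sketch; 'elastic_tensor_2015' is one of the matminer dataset names listed by
# get_available_datasets at the time of writing):
#
#   matminer_data = MatminerDatasets()
#   matminer_data.get_available_datasets()
#   df = matminer_data.download_data(name='elastic_tensor_2015', save_data=False)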