"""
This module provides various methods for importing data into MAST-ML.
SklearnDatasets:
Enables easy import of model datasets from scikit-learn, such as boston housing data, friedman, etc.
LocalDatasets:
Main method for importing datasets that are stored in an accessible path. Main file format is Excel
spreadsheet (.xls or .xlsx). This method also makes it easy for separately denoting other data features
that are not directly the X or y data, such as features used for grouping, extra features not used in
fitting, or features that denote manually held-out test data.
FigshareDatasets:
Method to download data that is stored on Figshare, an open-source data hosting service. This class
can be used to download data, then subsequently the LocalDatasets class can be used to import the data.
FoundryDatasets:
Method to download data that is stored on the Materials Data Facility (MDF) Foundry data hosting service.
This class can be used to download data, then subsequently the LocalDatasets class can be used to import
the data.
MatminerDatasets:
Method to download data that is stored as part of the matminer machine learning package
(https://github.com/hackingmaterials/matminer). This class can be used to download data, then
subsequently the LocalDatasets class can be used to import the data.
"""
import pandas as pd
import os
import numpy as np
from pprint import pprint
import shutil
import pickle
import sklearn.datasets
from mdf_forge import Forge
from matminer.datasets.dataset_retrieval import load_dataset, get_available_datasets
# Figshare is an optional dependency: the module must stay importable without it.
# Only a failed import is tolerated here; a bare ``except:`` would also swallow
# KeyboardInterrupt/SystemExit raised while the import is running.
try:
    from figshare.figshare.figshare import Figshare
except ImportError:
    print('Figshare is an optional dependency. To import data from figshare, manually install figshare via git clone of '
          'git clone https://github.com/cognoma/figshare.git')
class SklearnDatasets():
    """
    Convenience wrapper around the sklearn.datasets loaders for quick import of toy datasets. The Boston
    housing and Friedman loaders are adapted so that they can also hand back pandas dataframes.

    NOTE: sklearn.datasets.load_boston was removed from scikit-learn in version 1.2, so load_boston only works
    with older scikit-learn releases.

    Args:
        return_X_y: (bool), whether to return X, y data as (X, y) tuple (should be true for easiest use in MASTML)

        as_frame: (bool), whether to return X, y data as pandas dataframe objects

    Methods:
        load_boston: Loads the Boston housing data (regression). If as_frame is True, always returns an
            (X, y) tuple of dataframes, with the target column named 'MEDV'

        load_iris: Loads the flower iris data (classification)

        load_diabetes: Loads the diabetes data set (regression)

        load_digits: Loads the MNIST digits data set (classification)
            Args:
                n_class: (int), number of classes

        load_linnerud: Loads the linnerud data set (regression)

        load_wine: Loads the wine data set (classification)

        load_breast_cancer: Loads the breast cancer data set (classification)

        load_friedman: Generates the Friedman #1 data set (regression). Always returns an (X, y) tuple
            Args:
                n_samples: (int), number of data points to generate

                n_features: (int), number of features to generate

                noise: (float), noise value forwarded to sklearn.datasets.make_friedman1

    """

    def __init__(self, return_X_y=True, as_frame=False):
        self.as_frame = as_frame
        self.return_X_y = return_X_y

    def _load(self, loader, **loader_kwargs):
        # Shared call path: every stock sklearn loader receives the instance-level output settings.
        return loader(return_X_y=self.return_X_y, as_frame=self.as_frame, **loader_kwargs)

    def load_boston(self):
        if not self.as_frame:
            return sklearn.datasets.load_boston(return_X_y=self.return_X_y)
        # load_boston is called without as_frame here, so the dataframes are assembled by hand.
        bunch = sklearn.datasets.load_boston()
        features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        prices = pd.DataFrame(bunch.target, columns=['MEDV'])
        return features, prices

    def load_iris(self):
        return self._load(sklearn.datasets.load_iris)

    def load_diabetes(self):
        return self._load(sklearn.datasets.load_diabetes)

    def load_digits(self, n_class=10):
        return self._load(sklearn.datasets.load_digits, n_class=n_class)

    def load_linnerud(self):
        return self._load(sklearn.datasets.load_linnerud)

    def load_wine(self):
        return self._load(sklearn.datasets.load_wine)

    def load_breast_cancer(self):
        return self._load(sklearn.datasets.load_breast_cancer)

    def load_friedman(self, n_samples=100, n_features=10, noise=0.0):
        X, y = sklearn.datasets.make_friedman1(n_samples=n_samples, n_features=n_features, noise=noise)
        if not self.as_frame:
            return X, y
        # Feature columns are labelled x0, x1, ..., x<n_features - 1>
        column_labels = ["x{}".format(idx) for idx in range(n_features)]
        X_frame = pd.DataFrame(X, columns=column_labels)
        y_frame = pd.DataFrame(y, columns=['target'])
        return X_frame, y_frame
class LocalDatasets():
    """
    Class to handle import and organization of a dataset stored locally.

    Args:
        file_path: (str), path to the data file to import

        feature_names: (list), list of strings containing the X feature names

        target: (str), string denoting the y data (target) name

        extra_columns: (list), list of strings containing additional column names that are not features or target

        group_column: (str), string denoting the name of an input column to be used to group data

        testdata_columns: (list), list of strings containing column names denoting sets of left-out data. Entries should be marked with a 0 (not left out) or 1 (left out)

        average_duplicates: (bool), whether to average duplicate entries from the imported data.

        average_duplicates_col: (str), string denoting column name to perform averaging of duplicate entries. Needs to be specified if average_duplicates is True.

        as_frame: (bool), whether to return data as pandas dataframe (otherwise will be numpy array)

    Methods:
        _import: imports the data. Should be .csv, .xls, .xlsx or .pickle format (extension match is case-insensitive)
            Args:
                None

            Returns:
                df: (pd.DataFrame), pandas dataframe of full dataset

            Raises:
                ValueError: if the file extension is not one of the supported formats

        _get_features: Method to assess which columns belong to target, feature_names. Fills in self.target and/or
            self.feature_names when they were not provided
            Args:
                df: (pd.DataFrame), pandas dataframe of full dataset

            Returns:
                None

            Raises:
                ValueError: if feature_names was given without target and no remaining column can serve as target

        _to_array: Converts one entry of the output dictionary to a numpy array
            Args:
                value: (object), the dataframe, series, list or None to convert

            Returns:
                array: (np.ndarray), numpy representation of value

        load_data: Method to import the data and ascertain which columns are features, target and extra based on provided input.
            Args:
                copy: (bool), whether or not to copy the imported data to the designated savepath

                savepath: (str), path to save the data to (used if copy=True)

            Returns:
                data_dict: (dict), dictionary containing dataframes of X, y, groups, X_extra, X_testdata

    """

    def __init__(self, file_path, feature_names=None, target=None, extra_columns=None, group_column=None,
                 testdata_columns=None, average_duplicates=False, average_duplicates_col=None, as_frame=False):
        self.file_path = file_path
        self.feature_names = feature_names
        self.target = target
        # Fresh lists per instance so that omitted arguments never share state between instances.
        self.extra_columns = list() if extra_columns is None else extra_columns
        self.group_column = group_column
        self.testdata_columns = list() if testdata_columns is None else testdata_columns
        self.average_duplicates = average_duplicates
        self.average_duplicates_col = average_duplicates_col
        self.as_frame = as_frame

    def _import(self):
        # Lower-case the extension so that e.g. 'DATA.CSV' or 'data.XLSX' are accepted too.
        ext = os.path.splitext(self.file_path)[1].lower()
        if ext == '.csv':
            return pd.read_csv(self.file_path)
        if ext == '.xls':
            return pd.read_excel(self.file_path)
        if ext == '.xlsx':
            try:
                return pd.read_excel(self.file_path)
            except Exception:
                # The default reader could not handle the file; retry with the openpyxl engine explicitly.
                return pd.read_excel(self.file_path, engine='openpyxl')
        if ext == '.pickle':
            # SECURITY NOTE: unpickling executes arbitrary code. Only load pickle files from trusted sources.
            with open(self.file_path, "rb") as input_file:
                return pd.DataFrame(pickle.load(input_file))
        raise ValueError('file_path must be .csv, .xls, .xlsx or .pickle for local data import')

    def _get_features(self, df):
        features_given = self.feature_names is not None
        target_given = self.target is not None
        # Columns the user explicitly declared as neither feature nor target.
        reserved_cols = list(self.extra_columns) + list(self.testdata_columns)

        if not features_given and not target_given:
            print('WARNING: feature_names and target are not specified. Assuming last column is target value and remaining columns are features')
            self.target = df.columns[-1]
        elif not features_given:
            print('WARNING: feature_names not specified but target was specified. Assuming all columns except target and extra columns are features')
        elif not target_given:
            # The target is the last column that is neither a feature nor a declared extra/test-data column.
            taken = list(self.feature_names) + reserved_cols
            for col in df.columns[::-1]:
                if col not in taken:
                    self.target = col
                    break
            else:
                raise ValueError('Could not infer the target column: every column is listed as a feature, extra or testdata column')

        if not features_given:
            # Test each column against the flattened list of reserved names (a list of lists never matches a name).
            self.feature_names = [col for col in df.columns
                                  if col != self.target and col not in reserved_cols]
        return

    @staticmethod
    def _to_array(value):
        try:
            return np.array(value)
        except ValueError:
            if not isinstance(value, list):
                raise
            # Index arrays of unequal length (one per testdata column) cannot form a rectangular array;
            # store them in a 1-D object array instead, one entry per testdata column.
            ragged = np.empty(len(value), dtype=object)
            for position, item in enumerate(value):
                ragged[position] = item
            return ragged

    def load_data(self, copy=False, savepath=None):
        if copy and savepath is not None:
            fname = os.path.basename(os.path.normpath(self.file_path))
            shutil.copy(self.file_path, os.path.join(savepath, fname))

        # Import data from file
        df = self._import()

        # Average duplicate entries, if specified
        if self.average_duplicates:
            if self.average_duplicates_col is not None:
                # numeric_only=True: non-numeric columns cannot be averaged and are dropped instead of raising.
                df = df.groupby(self.average_duplicates_col).mean(numeric_only=True).reset_index()
            else:
                print('Error: you need to specify average_duplicates_col if average_duplicates is True')

        # Assign default values to input_features and target_feature
        self._get_features(df=df)

        data_dict = dict()
        data_dict['X'] = df[self.feature_names]
        # Plain column selection keeps y a Series even for a single-row dataset.
        data_dict['y'] = df[self.target]
        data_dict['groups'] = df[self.group_column] if self.group_column else None
        data_dict['X_extra'] = df[self.extra_columns] if self.extra_columns else None
        if self.testdata_columns:
            # One array of row indices (entries marked with 1) per testdata column
            data_dict['X_testdata'] = [np.array(df.loc[df[col] == 1].index).ravel()
                                       for col in self.testdata_columns]
        else:
            data_dict['X_testdata'] = None

        if self.as_frame:
            return data_dict
        # Entries that are None become 0-d object arrays, since np.array(None) is applied to them as well.
        return {key: self._to_array(value) for key, value in data_dict.items()}
class FigshareDatasets():
    """
    Class to download datasets hosted on Figshare. To install: git clone https://github.com/cognoma/figshare.git

    Args:
        None

    Methods:
        download_data: downloads specified data from Figshare. The download is expected to land in a folder
            named 'figshare_<article_id>' in the current working directory, which is then moved to savepath
            if one is given
            Args:
                article_id: (int), the number denoting the Figshare article ID. Can be obtained from the URL to the Figshare dataset

                savepath: (str), string denoting the savepath of the MAST-ML run

            Returns:
                None

            Raises:
                ImportError: if the optional figshare package is not available

    """

    def __init__(self):
        pass

    def download_data(self, article_id, savepath=None):
        # Figshare is an optional import; when it is missing the name is absent from the module globals.
        # Fail with an actionable message instead of an opaque NameError.
        figshare_client = globals().get('Figshare')
        if figshare_client is None:
            raise ImportError('Figshare is an optional dependency. To import data from figshare, manually install '
                              'figshare via git clone https://github.com/cognoma/figshare.git')
        fs = figshare_client()
        fs.retrieve_files_from_article(article_id)
        if savepath:
            try:
                shutil.move(os.path.join(os.getcwd(), 'figshare_'+str(article_id)), savepath)
            except shutil.Error:
                print('Warning: could not move downloaded data to specified savepath, maybe because savepath is the current working directory')
        return
class FoundryDatasets():
    """
    Class to download datasets hosted on Materials Data Facility

    Args:
        no_local_server: (bool), whether or not the server is local. Set to True if running on e.g. Google Colab

        anonymous: (bool), whether to use your MDF user or be anonymous. Some functionality may be disabled if True

        test: (bool), whether to be in test mode. Some functionality may be disabled if True

    Methods:
        download_data: looks up a dataset on MDF by name or DOI and, if exactly one match is found, prints the
            entry and optionally downloads it. If neither name nor doi is given, an error is printed and no
            search is performed
            Args:
                name: (str), name of the dataset to download (takes precedence over doi if both are given)

                doi: (str), digital object identifier of the dataset to download

                download: (bool), whether or not to download the full dataset

            Returns:
                None

    """

    def __init__(self, no_local_server, anonymous, test):
        self.no_local_server = no_local_server
        self.anonymous = anonymous
        self.test = test
        self.mdf = Forge(no_local_server=self.no_local_server,
                         anonymous=self.anonymous,
                         test=self.test)

    def download_data(self, name=None, doi=None, download=False):
        if name is not None:
            self.mdf.match_source_names(name)
        elif doi is not None:
            self.mdf.match_dois(doi)
        else:
            print('ERROR: please specify either the dataset name or DOI for lookup MDF')
            # Without a name or DOI there is no query to run, so stop instead of searching unconstrained.
            return

        result = self.mdf.search()
        if len(result) != 1:
            # Report the lookup failure instead of silently doing nothing.
            print('WARNING: expected exactly one matching dataset on MDF but found %d; nothing was downloaded' % len(result))
            return

        print('Successfully found the desired dataset on MDF')
        print('MDF entry:')
        pprint(result)
        if download:
            print('Downloading dataset from MDF')
            self.mdf.globus_download(results=result)
        return
class MatminerDatasets():
    """
    Class to download datasets hosted from the Matminer package's Figshare page. A summary of available datasets
    can be found at: https://hackingmaterials.lbl.gov/matminer/dataset_summary.html

    Args:
        None

    Methods:
        download_data: downloads specified data from Matminer/Figshare and optionally saves it to the current
            working directory as '<name>.xlsx' and '<name>.pickle'
            Args:
                name: (str), name of the dataset to download. For compatible names, call get_available_datasets

                save_data: (bool), whether to save the downloaded data to the current working directory

            Returns:
                df: (dataframe), dataframe of downloaded data

        get_available_datasets: returns information on the available dataset names and details one can download
            Args:
                None.

            Returns:
                datasets: the value returned by matminer's get_available_datasets (presumably the list of
                    available dataset names; confirm against the matminer documentation)

    """

    def __init__(self):
        pass

    def download_data(self, name, save_data=True):
        df = load_dataset(name=name)
        if save_data:
            df.to_excel(name+'.xlsx', index=False)
            with open('%s.pickle' % name, 'wb') as data_file:
                pickle.dump(df, data_file)
        return df

    def get_available_datasets(self):
        # Inside the method body this name resolves to the module-level matminer function, not to this method.
        # Hand its result back to the caller instead of discarding it.
        return get_available_datasets()