"""
This module contains routines to set up and manage the metadata for a MAST-ML run
Mastml:
Class to set up directories for saving the output of a MAST-ML run, and for constructing and updating a
metadata summary file.
"""
import os
from datetime import datetime
from collections import OrderedDict
import json
from pathos.multiprocessing import ProcessingPool as Pool
from functools import partial
class Mastml():
    """
    Main helper class to initialize mastml runs and create and manage run metadata

    Args:
        savepath: (str), string specifying the savepath name for the mastml run

        mastml_metadata: (dict), dict of mastml metadata. If None, a new dict will be created

    Methods:
        _initialize_run: initializes the output directory, creates a new metadata dict if none was given, and
            saves the metadata to the output directory.
            Args:
                None
            Returns:
                None

        _initialize_output: creates the output folder at savepath. An existing empty folder is reused; if the
            path exists and cannot be removed as an empty folder, a datetime suffix is appended to savepath.
            Args:
                None
            Returns:
                None

        _initialize_metadata: creates a new metadata dict and stores the savepath in it
            Args:
                None
            Returns:
                None

        _update_metadata: records information about one split in the in-memory metadata dict, under
            mastml_metadata[outerdir][split_name]. Nothing is written to disk by this method.
            Args:
                outerdir: (str), top-level key of the metadata entry
                split_name: (str), second-level key of the metadata entry
                model, preprocessor, selector, hyperopt: objects whose class name is recorded. For model, the
                    class name of its ``model`` attribute is used when that attribute exists.
                splitter: object with a ``splitter`` attribute whose class name is recorded
                *_stats, X_*, y_*, residuals_*, model_errors_*: objects providing ``to_json()``; the returned
                    value is stored. X_train must additionally provide ``columns.tolist()``.
                dataset_stdev: value stored unchanged
            Returns:
                None

        _save_mastml_metadata: saves the metadata dict as 'mastml_metadata.json' inside savepath
            Args:
                None
            Returns:
                None

        get_savepath: property returning the savepath
            Args:
                None
            Returns:
                string specifying the savepath of the mastml run

        get_mastml_metadata: property returning the metadata dict
            Args:
                None
            Returns:
                mastml metadata object (an OrderedDict when created by this class)
    """

    def __init__(self, savepath, mastml_metadata=None):
        self.savepath = savepath
        self.mastml_metadata = mastml_metadata
        self._initialize_run()

    def _initialize_run(self):
        # The output folder must exist before the metadata file can be written into it.
        self._initialize_output()
        if self.mastml_metadata is None:
            self._initialize_metadata()
        self._save_mastml_metadata()

    def _initialize_output(self):
        # Make an output folder for the run to store all data to
        if os.path.exists(self.savepath):
            try:
                os.rmdir(self.savepath)  # succeeds if empty
            except OSError:  # directory not empty
                print(f"{self.savepath} not empty. Renaming...")
                # Drop any trailing separator so the suffix extends the folder name itself.
                stem = self.savepath.rstrip(os.sep)
                self.savepath = f"{stem}_{datetime.now():%Y_%m_%d_%H_%M_%S}"
        os.makedirs(self.savepath)

    def _initialize_metadata(self):
        self.mastml_metadata = OrderedDict()
        self.mastml_metadata['savepath'] = self.savepath

    def _update_metadata(self,
                         outerdir,
                         split_name,
                         model=None,
                         splitter=None,
                         preprocessor=None,
                         selector=None,
                         hyperopt=None,
                         train_stats=None,
                         test_stats=None,
                         leaveout_stats=None,
                         X_train=None,
                         X_test=None,
                         X_leaveout=None,
                         X_extra_train=None,
                         X_extra_test=None,
                         X_extra_leaveout=None,
                         y_train=None,
                         y_test=None,
                         y_test_domain=None,
                         y_leaveout=None,
                         y_pred_train=None,
                         y_pred=None,
                         y_pred_leaveout=None,
                         residuals_train=None,
                         residuals_test=None,
                         residuals_leaveout=None,
                         model_errors_train=None,
                         model_errors_test=None,
                         model_errors_leaveout=None,
                         model_errors_train_cal=None,
                         model_errors_test_cal=None,
                         model_errors_leaveout_cal=None,
                         dataset_stdev=None):
        # Locate (or create) the nested entry metadata[outerdir][split_name].
        if outerdir not in self.mastml_metadata:
            self.mastml_metadata[outerdir] = OrderedDict()
        if split_name not in self.mastml_metadata[outerdir]:
            self.mastml_metadata[outerdir][split_name] = OrderedDict()
        entry = self.mastml_metadata[outerdir][split_name]

        entry['splitdir'] = outerdir if split_name == 'split_outer_dir' else split_name

        if model is not None:
            # Prefer the class name of the wrapped ``model`` attribute; fall back to the
            # object's own class when it has no such attribute.
            try:
                model_name = model.model.__class__.__name__
            except AttributeError:
                model_name = model.__class__.__name__
            entry['model'] = model_name
            pickle_name = model_name + '.pkl'
            if split_name == 'split_summary':
                entry['model_path'] = os.path.join(self.savepath, outerdir, pickle_name)
            elif split_name == 'split_outer_summary':
                entry['model_path'] = os.path.join(outerdir, pickle_name)
            else:
                entry['model_path'] = os.path.join(self.savepath, outerdir, split_name, pickle_name)

        if splitter is not None:
            entry['splitter'] = splitter.splitter.__class__.__name__

        # Components for which only the class name is recorded.
        for key, component in (('preprocessor', preprocessor),
                               ('selector', selector),
                               ('hyperopt', hyperopt)):
            if component is not None:
                entry[key] = component.__class__.__name__

        for key, stats in (('train_stats', train_stats),
                           ('test_stats', test_stats),
                           ('leaveout_stats', leaveout_stats)):
            if stats is not None:
                entry[key] = stats.to_json()

        if X_train is not None:
            entry['train_columns'] = X_train.columns.tolist()
            entry['X_train'] = X_train.to_json()

        # Order matters: it fixes the key order of newly created entries in the saved JSON.
        for key, data in (('X_test', X_test),
                          ('X_leaveout', X_leaveout),
                          ('X_extra_train', X_extra_train),
                          ('X_extra_test', X_extra_test),
                          ('X_extra_leaveout', X_extra_leaveout),
                          ('y_train', y_train),
                          ('y_test', y_test),
                          ('y_leaveout', y_leaveout),
                          ('y_pred_train', y_pred_train),
                          ('y_pred', y_pred),
                          ('y_pred_leaveout', y_pred_leaveout),
                          ('y_test_domain', y_test_domain),
                          ('residuals_train', residuals_train),
                          ('residuals_test', residuals_test),
                          ('residuals_leaveout', residuals_leaveout),
                          ('model_errors_train', model_errors_train),
                          ('model_errors_test', model_errors_test),
                          ('model_errors_leaveout', model_errors_leaveout),
                          ('model_errors_train_cal', model_errors_train_cal),
                          ('model_errors_test_cal', model_errors_test_cal),
                          ('model_errors_leaveout_cal', model_errors_leaveout_cal)):
            if data is not None:
                entry[key] = data.to_json()

        if dataset_stdev is not None:
            entry['dataset_stdev'] = dataset_stdev

    def _save_mastml_metadata(self):
        with open(os.path.join(self.savepath, 'mastml_metadata.json'), 'w') as f:
            json.dump(self.mastml_metadata, f)

    @property
    def get_savepath(self):
        return self.savepath

    @property
    def get_mastml_metadata(self):
        return self.mastml_metadata
def parallel(func, x, *args, **kwargs):
    """
    Run some function in parallel.

    Args:
        func: The function to apply.
        x: The list of items to apply function on.
        *args: Extra positional arguments bound to func. Because they are bound with
            functools.partial, they are passed BEFORE each item of x, i.e. func(*args, item, **kwargs).
        **kwargs: Extra keyword arguments bound to func.

    Returns:
        data: List of items returned by func.
    """
    part_func = partial(func, *args, **kwargs)
    # A single pool, sized to the machine's CPU count, is created for the duration of the mapping.
    with Pool(os.cpu_count()) as pool:
        data = list(pool.imap(part_func, x))
    return data
def write_requirements():
    """
    Write requirement files for the current Python environment into the current working directory.

    Two files are written:
        reqs_all.txt: the full output of ``pip freeze``
        requirements.txt: the names (without version pins) of the core packages listed below that appear
            in the ``pip freeze`` output as ``name==version`` entries

    Args:
        None

    Returns:
        None
    """
    # Function-scope imports keep this routine self-contained.
    import subprocess
    import sys

    # Use the pip that belongs to the running interpreter rather than whichever `pip` is first on PATH.
    # A failing pip is tolerated (best effort): it simply yields empty output.
    frozen = subprocess.run([sys.executable, '-m', 'pip', 'freeze'],
                            stdout=subprocess.PIPE,
                            universal_newlines=True)
    freeze_output = frozen.stdout or ''

    with open('reqs_all.txt', 'w') as f:
        f.write(freeze_output)

    # Package names are whatever precedes '==' on each line of the freeze output.
    installed = {line.strip().split('==')[0] for line in freeze_output.splitlines()}

    reqs = ['matplotlib',
            'numpy',
            'pandas',
            'pymatgen',
            'scikit-learn',
            'mastml']
    with open('requirements.txt', 'w') as f:
        f.writelines(req + '\n' for req in reqs if req in installed)