"""
This module contains a collection of classes for generating input features to fit machine learning models to.
"""
import multiprocessing
import os
import logging
import re
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PolynomialFeatures as SklearnPolynomialFeatures
try:
from pymatgen import Element, Composition
from pymatgen.ext.matproj import MPRester
from pymatgen.io.vasp.inputs import Poscar
except:
print('Error with importing pymatgen, try re-installing and try again')
# matminer class imports
import inspect # used to get a dictionary of classes in a module
try:
from matminer.featurizers import structure as struc
from matminer.data_retrieval.retrieve_Citrine import CitrineDataRetrieval
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from matminer.data_retrieval.retrieve_MDF import MDFDataRetrieval
from matminer.data_retrieval.retrieve_MPDS import MPDSDataRetrieval
from matminer.data_retrieval.retrieve_AFLOW import AFLOWDataRetrieval
except:
print('Error with importing matminer, try re-installing and try again')
# locate path to directory containing AtomicNumber.table, AtomicRadii.table, AtomicVolume.table, etc.
# (needs to be done the hard way because python -m sets cwd to wherever python is run from)
import mastml
from mastml import utils
log = logging.getLogger('mastml')
MAGPIE_DATA_PATH = os.path.join(mastml.__path__[0], 'magpie')
class PolynomialFeatures(BaseEstimator, TransformerMixin):
    """
    Class to generate polynomial features using scikit-learn's polynomial features method

    More info at: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html

    Args:

        features: (list of str), names of the dataframe columns to build polynomial features from. If None, all
        columns of the dataframe passed to fit are used

        degree: (int), degree of polynomial features

        interaction_only: (bool), If true, only interaction features are produced: features that are products of at
        most degree distinct input features (so not x[1] ** 2, x[0] * x[2] ** 3, etc.).

        include_bias: (bool),If True (default), then include a bias column, the feature in which all polynomial powers
        are zero (i.e. a column of ones - acts as an intercept term in a linear model).

    Methods:

        fit: conducts fit method of polynomial feature generation

            Args:

                df: (dataframe), dataframe of input X and y data

        transform: generates dataframe containing polynomial features

            Args:

                df: (dataframe), dataframe of input X and y data

            Returns:

                (dataframe), dataframe containing new polynomial features, plus original features present

    """

    def __init__(self, features=None, degree=2, interaction_only=False, include_bias=True):
        self.features = features
        # BaseEstimator.get_params() (used by sklearn.base.clone and repr) reads every __init__ parameter back
        # from an attribute of the same name, so the three options below must be stored on the instance.
        self.degree = degree
        self.interaction_only = interaction_only
        self.include_bias = include_bias
        # Pass the options by keyword: in current scikit-learn releases everything after `degree` is
        # keyword-only, so a positional call raises TypeError.
        self.SPF = SklearnPolynomialFeatures(degree=degree,
                                             interaction_only=interaction_only,
                                             include_bias=include_bias)

    def fit(self, df, y=None):
        """
        Fit the wrapped scikit-learn PolynomialFeatures object on the selected columns of df.

        Args:

            df: (dataframe), dataframe of input X and y data

            y: ignored, present for scikit-learn API compatibility

        Returns:

            self
        """
        # Default to every column of the dataframe when no explicit feature list was configured
        if self.features is None:
            self.features = df.columns
        # Re-sync the wrapped transformer so that values changed through set_params() take effect
        self.SPF.set_params(degree=self.degree,
                            interaction_only=self.interaction_only,
                            include_bias=self.include_bias)
        array = df[self.features].values
        self.SPF.fit(array)
        return self
class ContainsElement(BaseEstimator, TransformerMixin):
    """
    Class to generate new categorical features (i.e. values of 1 or 0) based on whether an input composition contains a
    certain designated element

    Args:

        composition_feature: (str), string denoting a chemical composition to generate elemental features from

        element: (str), string representing the name of an element

        new_name: (str), the name of the new feature column to be generated

        all_elements: (bool), whether to generate new features for all elements present from all compositions in the dataset.

    Methods:

        fit: pass through, needed to maintain scikit-learn class structure

            Args:

                df: (dataframe), dataframe of input X and y data

        transform: generate new element-specific features

            Args:

                df: (dataframe), dataframe of input X and y data

            Returns:

                df_trans: (dataframe), dataframe with generated element-specific features

    """

    def __init__(self, composition_feature, element, new_name, all_elements=False):
        self.composition_feature = composition_feature
        self.element = element
        self.new_column_name = new_name #f'has_{self.element}'
        self.all_elements = all_elements

    def fit(self, df, y=None):
        """Pass-through fit; returns self unchanged."""
        return self

    def _contains_element(self, comp, element=None):
        """
        Return 1 if the composition contains the element and 0 if not.

        Integers are used rather than booleans so the result can be used directly as a numeric feature.

        Args:

            comp: composition accepted by the pymatgen Composition constructor (e.g. a formula string)

            element: element to look for; defaults to the element configured on this instance

        Returns:

            (int), 1 when the amount of the element in comp is non-zero, otherwise 0
        """
        if element is None:
            element = self.element
        return int(Composition(comp)[element] != 0)

    def _contains_all_elements(self, compositions):
        """
        Build one "has_<element>" 0/1 column for every element found in any of the compositions.

        The configured self.element and self.new_column_name are left untouched, so the instance can be
        reused (or cloned) after this call.

        Args:

            compositions: (pandas Series), the composition of each data point

        Returns:

            df_trans: (dataframe), one column per distinct element, in order of first appearance
        """
        # Collect the distinct elements, preserving the order in which they are first seen
        elements = list()
        for comp in compositions.values:
            for element in Composition(comp).elements:
                if element not in elements:
                    elements.append(element)

        df_trans = pd.DataFrame()
        for element in elements:
            # The element is handed to the helper explicitly instead of being written onto self
            df_trans["has_" + str(element)] = compositions.apply(self._contains_element, args=(element,))
        return df_trans
class Magpie(BaseEstimator, TransformerMixin):
    """
    Scikit-learn style wrapper around MagpieFeatureGeneration

    Args:

        composition_feature: (str), string denoting a chemical composition to generate elemental features from

        feature_types: (list), names of the Magpie feature groups to generate. When None, the six groups
        'composition_avg', 'arithmetic_avg', 'max', 'min', 'difference' and 'elements' are used

    Methods:

        fit: pass through, copies input columns as pre-generated features

            Args:

                df: (dataframe), input dataframe containing X and y data

        transform: generate Magpie features

            Args:

                df: (dataframe), input dataframe containing X and y data

            Returns:

                df: (dataframe), output dataframe containing generated features, original features and y data

    """

    def __init__(self, composition_feature, feature_types=None):
        # Fall back to the full set of feature groups when the caller did not choose any
        if feature_types is None:
            feature_types = ['composition_avg', 'arithmetic_avg', 'max', 'min', 'difference', 'elements']
        self.feature_types = feature_types
        self.composition_feature = composition_feature

    def fit(self, df, y=None):
        """Remember the columns of the incoming dataframe and return self."""
        self.original_features = df.columns
        return self
class MaterialsProject(BaseEstimator, TransformerMixin):
    """
    Scikit-learn style wrapper around MaterialsProjectFeatureGeneration

    Args:

        composition_feature: (str), string denoting a chemical composition to generate elemental features from

        api_key: (str), string denoting your Materials Project API key

    Methods:

        fit: pass through, copies input columns as pre-generated features

            Args:

                df: (dataframe), input dataframe containing X and y data

        transform: generate Materials Project features

            Args:

                df: (dataframe), input dataframe containing X and y data

            Returns:

                df: (dataframe), output dataframe containing generated features, original features and y data

    """

    def __init__(self, composition_feature, api_key):
        self.api_key = api_key
        self.composition_feature = composition_feature

    def fit(self, df, y=None):
        """Remember the columns of the incoming dataframe and return self."""
        self.original_features = df.columns
        return self
class Matminer(BaseEstimator, TransformerMixin):
    """
    Class to generate structural features from matminer structure module

    Args:

        structural_features: the structure feature(s) the user wants to instantiate and generate; a single
        name given as a string is wrapped into a one-element list

        structure_col: the dataframe column that contains the pymatgen structure object. Matminer needs a pymatgen
        structure object in order to instantiate the structural feature

    Methods:

        fit: pass through, needed to maintain scikit-learn class structure

            Args:

                df: (dataframe), dataframe of input x and y data

        transform: main method that iterates through rows of dataframe to create pymatgen structure objects for
        matminer routines. Iterates through list of structural features from conf file and instantiates each structure;
        drops unused dataframe columns and returns the generated features dataframe

            Args:

                df: (dataframe), dataframe containing the path of file to create pymatgen structure object which is under the structure_col
                column

            Returns:

                (dataframe), the generated features dataframe

    """

    def __init__(self, structural_features, structure_col):
        # Normalise a single feature name to a list so later code can always iterate
        if isinstance(structural_features, str):
            structural_features = [structural_features]
        self.structural_features = structural_features
        self.structure_col = structure_col

    def fit(self, df, y=None):
        """Pass-through fit; returns self unchanged."""
        return self

    def retrieve_mp(self, criteria, properties=None, index_mpid=True, api_key=None):
        """
        Gets data from MP in a dataframe format and keeps only the entry with the lowest
        formation_energy_per_atom.

        Args:

            criteria (dict): (str/dict) see MPRester.query() for a description of this
                parameter. String examples: "mp-1234", "Fe2O3", "Li-Fe-O",
                "\\*2O3". Dict example: {"band_gap": {"$gt": 1}}

            properties ([str]): (list) see MPRester.query() for a description of this
                parameter. Example: ["formula", "formation_energy_per_atom"]
                plus: "structure", "initial_structure", "final_structure",
                "bandstructure" (line mode), "bandstructure_uniform",
                "phonon_bandstructure", "phonon_ddb", "phonon_bandstructure",
                "phonon_dos". Note that for a long list of compounds, it may
                take a long time to retrieve some of these objects.
                Defaults to ["band_gap", "volume", "density", "formation_energy_per_atom"].
                The list must contain "formation_energy_per_atom", because that column is
                used to select the returned row.

            index_mpid (bool): (bool) Whether to set the materials_id as the dataframe
                index.

            api_key: (str) Your Materials Project API key, or None if you've
                set up your pymatgen config.

        Returns (pandas.Dataframe): single-row dataframe containing the selected entry

        """
        # A fresh list per call: the list is handed to matminer, and a shared default list would leak any
        # modification made there into every later call.
        if properties is None:
            properties = ["band_gap", "volume", "density", "formation_energy_per_atom"]
        mp_df = MPDataRetrieval(api_key).get_dataframe(criteria, properties, index_mpid)
        # Keep only the row with the lowest formation energy and turn it back into a one-row dataframe
        most_stable = mp_df['formation_energy_per_atom'].idxmin()
        mp_df = mp_df.loc[most_stable, :].to_frame().transpose().reset_index().drop('index', axis=1)
        return mp_df

    def retrieve_citrine(self, criteria, properties, common_fields, secondary_fields, print_properties_options,
                         api_key):
        """
        Gets a Pandas dataframe object from data retrieved from
        the Citrine API.

        Args:

            criteria (dict): see get_data method for supported keys except
                prop; prop should be included in properties.

            properties ([str]): requested properties/fields/columns.
                For example, ["Seebeck coefficient", "Band gap"]. If unsure
                about the exact words, capitalization, etc try something like
                ["gap"] and "max_results": 3 and print_properties_options=True
                to see the exact options for this field

            common_fields ([str]): fields that are common to all the requested
                properties. Common example can be "chemicalFormula". Look for
                suggested common fields after a quick query for more info

            secondary_fields (bool): if True, fields not included in properties
                may be added to the output (e.g. references). Recommended only
                if len(properties)==1

            print_properties_options (bool): whether to print available options
                for "properties" and "common_fields" arguments.

            api_key: (str) Your Citrine API key, or None if
                you've set the CITRINE_KEY environment variable

        return: (object) Pandas dataframe object containing the results

        """
        citrine_df = CitrineDataRetrieval(api_key).get_dataframe(criteria, properties, common_fields, secondary_fields,
                                                                 print_properties_options)
        return citrine_df

    def retrieve_MDF(self, criteria, anonymous=False, properties=None, unwind_arrays=True):
        """
        Gets a dataframe from matminer's MDFDataRetrieval.

        anonymous is passed to the MDFDataRetrieval constructor; criteria, properties and unwind_arrays
        are forwarded unchanged to its get_dataframe method.
        """
        mdf_df = MDFDataRetrieval(anonymous).get_dataframe(criteria, properties, unwind_arrays)
        return mdf_df

    def retrieve_MPDS(self, criteria, properties=None, api_key=None, endpoint=None):
        """
        Gets a dataframe from matminer's MPDSDataRetrieval.

        api_key and endpoint are passed to the MPDSDataRetrieval constructor; criteria and properties are
        forwarded unchanged to its get_dataframe method.
        """
        mpds_df = MPDSDataRetrieval(api_key, endpoint).get_dataframe(criteria, properties)
        return mpds_df

    def retrieve_AFLOW(self, criteria, properties, files=None, request_size=10000, request_limit=0, index_auid=True):
        """
        Gets a dataframe from matminer's AFLOWDataRetrieval.

        All arguments are forwarded to AFLOWDataRetrieval.get_dataframe under the same names.
        """
        # Forward by keyword so that request_limit is actually honoured and index_auid cannot slide into the
        # request_limit position of the matminer signature.
        aflow_df = AFLOWDataRetrieval().get_dataframe(criteria, properties, files=files,
                                                      request_size=request_size, request_limit=request_limit,
                                                      index_auid=index_auid)
        return aflow_df
class NoGenerate(BaseEstimator, TransformerMixin):
    """
    "Null" feature generator whose output is the same as its input. Needed by MAST-ML as a placeholder if
    certain workflow aspects are not performed.

    Args:

        None

    Methods:

        fit: does nothing, just returns object instance. Needed to maintain same structure as scikit-learn classes

            Args:

                X: (dataframe), dataframe of X features

        transform: passes the input back out, in this case the array of X features

            Args:

                X: (dataframe), dataframe of X features

            Returns:

                (dataframe), dataframe of X features

    """

    def __init__(self):
        # Nothing to configure
        return None

    def fit(self, X, y=None):
        """No-op fit; hands back the instance itself."""
        return self
# Registry mapping a feature generator's string name to the class that implements it
# (presumably looked up by name elsewhere in the package -- confirm against callers).
# Note that the 'DoNothing' key maps to the NoGenerate class.
name_to_constructor = {
'DoNothing': NoGenerate,
'PolynomialFeatures': PolynomialFeatures,
'Magpie': Magpie,
'Matminer': Matminer,
'MaterialsProject': MaterialsProject,
'ContainsElement': ContainsElement,
}
def clean_dataframe(df):
    """
    Clean a dataframe after feature generation: coerce every value to a number (anything non-numeric becomes
    NaN), drop rows that are entirely NaN, then drop every column that still contains at least one NaN.
    A warning is logged whenever rows or columns are dropped.

    Args:

        df: (dataframe), a post feature generation dataframe that needs cleaning

    Returns:

        df: (dataframe), the cleaned dataframe

    """
    # Anything that cannot be parsed as a number turns into NaN
    df = df.apply(pd.to_numeric, errors='coerce')

    # Rows: remove only those with no usable value at all
    before_count = len(df.index)
    df = df.dropna(axis='index', how='all')
    lost_count = before_count - len(df.index)
    if lost_count > 0:
        log.warning(f'Dropping {lost_count}/{before_count} rows for being totally empty')

    # Columns: remove any column with one or more missing cells
    before_count = len(df.columns)
    numeric_only = df.select_dtypes(include=['number'])
    df = numeric_only.dropna(axis='columns', how='any')
    lost_count = before_count - len(df.columns)
    if lost_count > 0:
        log.warning(f'Dropping {lost_count}/{before_count} generated columns due to missing values')

    return df
[docs]class MagpieFeatureGeneration(object):
"""
Class to generate new features using Magpie data and dataframe containing material compositions
Args:
dataframe: (pandas dataframe), dataframe containing x and y data and feature names
composition_feature: (str), string denoting a chemical composition to generate elemental features from
feature_types: (list), list containing types of magpie features to include in the final dataframe. Options
include ["composition_avg", "arithmetic_avg", "max", "min", "difference", "elements"]. Specifying nothing will
include all features.
Methods:
generate_magpie_features : generates magpie feature set based on compositions in dataframe
Args:
None
Returns:
dataframe: (dataframe) : dataframe containing magpie feature set
"""
def __init__(self, dataframe, composition_feature, feature_types):
self.dataframe = dataframe
self.composition_feature = composition_feature
self.feature_types = feature_types
def generate_magpie_features(self):
    """
    Generate Magpie features for every composition in self.dataframe and merge them into that dataframe.

    Compositions may mark sublattices with square brackets, e.g. "[La0.5Sr0.5][Fe][O3]". Whether the data set
    uses sublattices is decided from the first composition only. When it does, the brackets are stripped from
    the composition column and per-site features (plus site-pair couplings for three sites) become available.
    Which per-site columns end up in the output is decided by the number of sites of the last composition.

    self.dataframe is modified: NaN values are replaced by empty strings and the composition column is
    overwritten with the bracket-free compositions.

    The sequence returned by self._get_computed_magpie_features is read by position:
        0-4    whole composition: composition average, arithmetic average, max, min, difference
        5-9    site 1 (same five statistics), 10-14 site 2, 15-19 site 3
        20-28  three-site data only: for Site1Site2, Site1Site3, Site2Site3 in turn, the
               composition average, arithmetic average and difference

    Args:

        None

    Returns:

        dataframe: (dataframe), the input dataframe merged with the selected Magpie feature sets

    Raises:

        utils.MissingColumnError: if the composition column holds no entries
        ValueError: if sublattices are in use and the last composition does not have 1, 2 or 3 bracketed sites
    """
    stat_names = ('composition_avg', 'arithmetic_avg', 'max', 'min', 'difference')
    coupling_pairs = ('Site1Site2', 'Site1Site3', 'Site2Site3')
    coupling_stats = ('composition_avg', 'arithmetic_avg', 'difference')
    n_stats = len(stat_names)
    max_sites = 3
    coupling_start = n_stats * (1 + max_sites)                                  # first coupling slot (20)
    n_slots = coupling_start + len(coupling_pairs) * len(coupling_stats)        # total slots (29)

    # Replace empty composition fields with empty string instead of NaN
    self.dataframe = self.dataframe.fillna('')
    compositions_raw = self.dataframe[self.composition_feature].tolist()

    # Must be checked before the first entry is inspected, otherwise empty input fails with IndexError
    if len(compositions_raw) < 1:
        raise utils.MissingColumnError('Error! No material compositions column found in your input data file. To use this feature generation routine, you must supply a material composition for each data point')

    # Check first entry of comps to find [] for delimiting different sublattices
    has_sublattices = '[' in compositions_raw[0] and ']' in compositions_raw[0]
    if has_sublattices:
        log.info('MAGPIE feature generation found brackets in material compositions denoting specific sublattices!')

    # Parse the bracketed part of every raw composition into {'Site1': {...}, 'Site2': {...}, ...}
    site_pattern = re.compile(r"\[([A-Za-z0-9_.]+)\]")
    site_dict_list = list()
    for comp in compositions_raw:
        site_dict = dict()
        for i, site in enumerate(site_pattern.findall(comp)):
            site_dict['Site'+str(i+1)] = Composition(site).as_dict()
        site_dict_list.append(site_dict)

    if has_sublattices:
        # Parse out brackets from compositions
        compositions = [comp.replace('[', '').replace(']', '') for comp in compositions_raw]
        # The per-site output columns are chosen from the site count of the last composition
        number_sites = len(site_dict_list[-1])
        if not 1 <= number_sites <= max_sites:
            raise ValueError('MAGPIE feature generation supports 1 to 3 sublattices per composition, but found '
                             + str(number_sites) + ' in the last composition')
    else:
        compositions = compositions_raw
        number_sites = 0

    # Add the column of combined material compositions into the dataframe
    self.dataframe[self.composition_feature] = compositions

    # slot_dicts[k] maps composition -> feature dict found at position k of the computed feature sequence
    slot_dicts = [dict() for _ in range(n_slots)]
    magpiedata_dict_atomic_bysite = {}

    for i, composition in enumerate(compositions):
        site_dict = site_dict_list[i] if has_sublattices else None
        magpiedata_collected = self._get_computed_magpie_features(composition=composition,
                                                                  data_path=MAGPIE_DATA_PATH,
                                                                  site_dict=site_dict)
        magpiedata_atomic_notparsed = self._get_atomic_magpie_features(composition=composition,
                                                                       data_path=MAGPIE_DATA_PATH)

        # How many leading positions of the computed sequence belong to this composition
        if not has_sublattices:
            n_used = n_stats
        else:
            row_sites = len(site_dict)
            if row_sites == max_sites:
                n_used = n_slots
            elif 1 <= row_sites < max_sites:
                n_used = n_stats * (1 + row_sites)
            else:
                # Unsupported site count: only the per-element features are recorded for this row
                n_used = 0
        for slot in range(n_used):
            slot_dicts[slot][composition] = magpiedata_collected[slot]

        # Also include magpie features of individual elements in the material
        magpiedata_atomic_bysite = {}
        for count, entry in enumerate(magpiedata_atomic_notparsed, start=1):
            for magpiefeature, featurevalue in magpiedata_atomic_notparsed[entry].items():
                magpiedata_atomic_bysite["Element"+str(count)+"_"+str(magpiefeature)] = featurevalue
        magpiedata_dict_atomic_bysite[composition] = magpiedata_atomic_bysite

    # The class documents "nothing specified" as "all features": use the six standard groups in that case
    feature_types = self.feature_types
    if feature_types is None:
        feature_types = stat_names + ('elements',)

    # Whole-composition feature sets first, then the per-element set
    magpiedata_dict_list_toinclude = [slot_dicts[k] for k, stat in enumerate(stat_names) if stat in feature_types]
    if 'elements' in feature_types:
        magpiedata_dict_list_toinclude.append(magpiedata_dict_atomic_bysite)

    # Then, statistic by statistic, the per-site sets followed by the requested site-pair couplings
    if has_sublattices:
        for k, stat in enumerate(stat_names):
            if stat not in feature_types:
                continue
            for site in range(1, number_sites + 1):
                magpiedata_dict_list_toinclude.append(slot_dicts[n_stats * site + k])
            if number_sites == max_sites and stat in coupling_stats:
                stat_offset = coupling_stats.index(stat)
                for p, pair in enumerate(coupling_pairs):
                    if pair in feature_types:
                        magpiedata_dict_list_toinclude.append(
                            slot_dicts[coupling_start + len(coupling_stats) * p + stat_offset])

    dataframe = self.dataframe
    for magpiedata_dict in magpiedata_dict_list_toinclude:
        dataframe_magpie = pd.DataFrame.from_dict(data=magpiedata_dict, orient='index')
        # Need to reorder compositions in new dataframe to match input dataframe
        dataframe_magpie = dataframe_magpie.reindex(self.dataframe[self.composition_feature].tolist())
        # Need to make compositions the first column, instead of the row names
        dataframe_magpie.index.name = self.composition_feature
        dataframe_magpie.reset_index(inplace=True)
        # Merge magpie feature dataframe with originally supplied dataframe
        dataframe = DataframeUtilities().merge_dataframe_columns(dataframe1=dataframe, dataframe2=dataframe_magpie)

    return dataframe
def _get_computed_magpie_features(self, composition, data_path, site_dict=None):
# Purpose: combine the per-element MAGPIE values of one composition into composition-level features
# (composition-weighted average, arithmetic average, max, min, max-min difference) and, when site_dict
# is given, the same statistics per sublattice plus pairwise "coupling" features for three sublattices.
# Args:
#   composition: formula forwarded to self._get_atomic_magpie_features and then parsed with Composition
#   data_path: forwarded unchanged to self._get_atomic_magpie_features
#   site_dict: optional dict keyed "Site1"/"Site2"/"Site3"; each value is a dict of element -> amount
# Returns:
#   a tuple of dicts keyed by the renamed feature names: 5 dicts when site_dict is falsy, and
#   10 / 15 / 29 dicts when site_dict holds 1 / 2 / 3 sites
magpiedata_composition_average = {}
magpiedata_arithmetic_average = {}
magpiedata_max = {}
magpiedata_min = {}
magpiedata_difference = {}
# Per-element feature values: {element: {feature name: value}}
magpiedata_atomic = self._get_atomic_magpie_features(composition=composition, data_path=data_path)
composition = Composition(composition)
element_list, atoms_per_formula_unit = self._get_element_list(composition=composition)
# Make per-site dicts if site_dict_list specified
if site_dict:
number_sites = len(site_dict.keys())
# For each named site record how many distinct elements it holds and the summed amount on that site;
# these are the denominators of the per-site arithmetic and composition-weighted averages below.
for site, comp_dict in site_dict.items():
if site == "Site1":
num_site1_elements = int(len(site_dict[site].keys()))
site1_total = 0
for el, amt in comp_dict.items():
site1_total += amt
if site == "Site2":
num_site2_elements = int(len(site_dict[site].keys()))
site2_total = 0
for el, amt in comp_dict.items():
site2_total += amt
if site == "Site3":
num_site3_elements = int(len(site_dict[site].keys()))
site3_total = 0
for el, amt in comp_dict.items():
site3_total += amt
# Create only the accumulator dicts needed for the number of sites supplied
if number_sites == 1:
magpiedata_composition_average_site1 = {}
magpiedata_arithmetic_average_site1 = {}
magpiedata_max_site1 = {}
magpiedata_min_site1 = {}
magpiedata_difference_site1 = {}
elif number_sites == 2:
magpiedata_composition_average_site1 = {}
magpiedata_arithmetic_average_site1 = {}
magpiedata_max_site1 = {}
magpiedata_min_site1 = {}
magpiedata_difference_site1 = {}
magpiedata_composition_average_site2 = {}
magpiedata_arithmetic_average_site2 = {}
magpiedata_max_site2 = {}
magpiedata_min_site2 = {}
magpiedata_difference_site2 = {}
elif number_sites == 3:
magpiedata_composition_average_site1 = {}
magpiedata_arithmetic_average_site1 = {}
magpiedata_max_site1 = {}
magpiedata_min_site1 = {}
magpiedata_difference_site1 = {}
magpiedata_composition_average_site2 = {}
magpiedata_arithmetic_average_site2 = {}
magpiedata_max_site2 = {}
magpiedata_min_site2 = {}
magpiedata_difference_site2 = {}
magpiedata_composition_average_site3 = {}
magpiedata_arithmetic_average_site3 = {}
magpiedata_max_site3 = {}
magpiedata_min_site3 = {}
magpiedata_difference_site3 = {}
# Couplings between sites
magpiedata_composition_average_site1site2 = {}
magpiedata_arithmetic_average_site1site2 = {}
magpiedata_difference_site1site2 = {}
magpiedata_composition_average_site1site3 = {}
magpiedata_arithmetic_average_site1site3 = {}
magpiedata_difference_site1site3 = {}
magpiedata_composition_average_site2site3 = {}
magpiedata_arithmetic_average_site2site3 = {}
magpiedata_difference_site2site3 = {}
else:
# NOTE(review): this only logs; nothing is raised or returned here, yet the code further down
# refers to per-site dicts that are not created on this path - confirm the intended handling.
log.error('MASTML currently only supports up to 3 sublattices to generate site-specific MAGPIE features. '
'Please reduce number of sublattices an re-run MASTML.')
# Initialize feature values to all be 0, because need to dynamically update them with weighted values in next loop.
# The set of feature names is taken from the first element of the composition only.
for magpie_feature in magpiedata_atomic[element_list[0]]:
magpiedata_composition_average[magpie_feature] = 0
magpiedata_arithmetic_average[magpie_feature] = 0
magpiedata_max[magpie_feature] = 0
magpiedata_min[magpie_feature] = 0
magpiedata_difference[magpie_feature] = 0
if site_dict:
if number_sites == 1:
magpiedata_composition_average_site1[magpie_feature] = 0
magpiedata_arithmetic_average_site1[magpie_feature] = 0
magpiedata_max_site1[magpie_feature] = 0
magpiedata_min_site1[magpie_feature] = 0
magpiedata_difference_site1[magpie_feature] = 0
elif number_sites == 2:
magpiedata_composition_average_site1[magpie_feature] = 0
magpiedata_arithmetic_average_site1[magpie_feature] = 0
magpiedata_max_site1[magpie_feature] = 0
magpiedata_min_site1[magpie_feature] = 0
magpiedata_difference_site1[magpie_feature] = 0
magpiedata_composition_average_site2[magpie_feature] = 0
magpiedata_arithmetic_average_site2[magpie_feature] = 0
magpiedata_max_site2[magpie_feature] = 0
magpiedata_min_site2[magpie_feature] = 0
magpiedata_difference_site2[magpie_feature] = 0
elif number_sites == 3:
magpiedata_composition_average_site1[magpie_feature] = 0
magpiedata_arithmetic_average_site1[magpie_feature] = 0
magpiedata_max_site1[magpie_feature] = 0
magpiedata_min_site1[magpie_feature] = 0
magpiedata_difference_site1[magpie_feature] = 0
magpiedata_composition_average_site2[magpie_feature] = 0
magpiedata_arithmetic_average_site2[magpie_feature] = 0
magpiedata_max_site2[magpie_feature] = 0
magpiedata_min_site2[magpie_feature] = 0
magpiedata_difference_site2[magpie_feature] = 0
magpiedata_composition_average_site3[magpie_feature] = 0
magpiedata_arithmetic_average_site3[magpie_feature] = 0
magpiedata_max_site3[magpie_feature] = 0
magpiedata_min_site3[magpie_feature] = 0
magpiedata_difference_site3[magpie_feature] = 0
# Couplings between sites
magpiedata_composition_average_site1site2[magpie_feature] = 0
magpiedata_arithmetic_average_site1site2[magpie_feature] = 0
magpiedata_difference_site1site2[magpie_feature] = 0
magpiedata_composition_average_site1site3[magpie_feature] = 0
magpiedata_arithmetic_average_site1site3[magpie_feature] = 0
magpiedata_difference_site1site3[magpie_feature] = 0
magpiedata_composition_average_site2site3[magpie_feature] = 0
magpiedata_arithmetic_average_site2site3[magpie_feature] = 0
magpiedata_difference_site2site3[magpie_feature] = 0
# Original magpie feature set
for element in magpiedata_atomic:
for magpie_feature, feature_value in magpiedata_atomic[element].items():
# NOTE(review): `is not 'NaN'` is an identity test against a string literal, so it only skips
# missing values when the sentinel happens to be the very same string object; an equality
# test (`!=`) is presumably what is meant - confirm before changing.
if feature_value is not 'NaN':
# Composition average features
magpiedata_composition_average[magpie_feature] += feature_value*float(composition[element])/atoms_per_formula_unit
# Arithmetic average features
magpiedata_arithmetic_average[magpie_feature] += feature_value/len(element_list)
# Max features
# NOTE(review): 0 doubles as the "not set yet" marker for max and min. A running value that is
# negative is never updated again and a genuine value of 0 is treated as unset - confirm that
# this is acceptable for the feature tables in use.
if magpiedata_max[magpie_feature] > 0:
if feature_value > magpiedata_max[magpie_feature]:
magpiedata_max[magpie_feature] = feature_value
elif magpiedata_max[magpie_feature] == 0:
magpiedata_max[magpie_feature] = feature_value
# Min features
if magpiedata_min[magpie_feature] > 0:
if feature_value < magpiedata_min[magpie_feature]:
magpiedata_min[magpie_feature] = feature_value
elif magpiedata_min[magpie_feature] == 0:
magpiedata_min[magpie_feature] = feature_value
# Difference features (max - min)
magpiedata_difference[magpie_feature] = magpiedata_max[magpie_feature] - magpiedata_min[magpie_feature]
# Site-specific magpie features
if site_dict:
for element in magpiedata_atomic:
for site, comp_dict in site_dict.items():
# Collect this element's feature dict only if the element sits on the current site
magpie_data_by_site_collected = list()
for el, amt in comp_dict.items():
if el == element:
magpie_data_by_site_collected.append(magpiedata_atomic[element])
# Here, calc magpie values over the particular site
for magpiedata in magpie_data_by_site_collected:
for magpie_feature, feature_value in magpiedata.items():
# Same identity-based 'NaN' test and 0-as-unset max/min convention as in the loop above
if feature_value is not 'NaN':
if site == "Site1":
# Composition weighted average by site
magpiedata_composition_average_site1[magpie_feature] += feature_value*float(site_dict[site][element])/site1_total
# Arithmetic average by site
magpiedata_arithmetic_average_site1[magpie_feature] += feature_value / num_site1_elements
# Max features by site
if magpiedata_max_site1[magpie_feature] > 0:
if feature_value > magpiedata_max_site1[magpie_feature]:
magpiedata_max_site1[magpie_feature] = feature_value
elif magpiedata_max_site1[magpie_feature] == 0:
magpiedata_max_site1[magpie_feature] = feature_value
# Min features by site
if magpiedata_min_site1[magpie_feature] > 0:
if feature_value < magpiedata_min_site1[magpie_feature]:
magpiedata_min_site1[magpie_feature] = feature_value
elif magpiedata_min_site1[magpie_feature] == 0:
magpiedata_min_site1[magpie_feature] = feature_value
# Difference features (max - min)
magpiedata_difference_site1[magpie_feature] = magpiedata_max_site1[magpie_feature] - magpiedata_min_site1[magpie_feature]
elif site == "Site2":
# Composition weighted average by site
magpiedata_composition_average_site2[magpie_feature] += feature_value*float(site_dict[site][element])/site2_total
# Arithmetic average by site
magpiedata_arithmetic_average_site2[magpie_feature] += feature_value / num_site2_elements
# Max features by site
if magpiedata_max_site2[magpie_feature] > 0:
if feature_value > magpiedata_max_site2[magpie_feature]:
magpiedata_max_site2[magpie_feature] = feature_value
elif magpiedata_max_site2[magpie_feature] == 0:
magpiedata_max_site2[magpie_feature] = feature_value
# Min features by site
if magpiedata_min_site2[magpie_feature] > 0:
if feature_value < magpiedata_min_site2[magpie_feature]:
magpiedata_min_site2[magpie_feature] = feature_value
elif magpiedata_min_site2[magpie_feature] == 0:
magpiedata_min_site2[magpie_feature] = feature_value
# Difference features (max - min)
magpiedata_difference_site2[magpie_feature] = magpiedata_max_site2[magpie_feature] - magpiedata_min_site2[magpie_feature]
elif site == "Site3":
# Composition weighted average by site
magpiedata_composition_average_site3[magpie_feature] += feature_value*float(site_dict[site][element])/site3_total
# Arithmetic average by site
magpiedata_arithmetic_average_site3[magpie_feature] += feature_value / num_site3_elements
# Max features by site
if magpiedata_max_site3[magpie_feature] > 0:
if feature_value > magpiedata_max_site3[magpie_feature]:
magpiedata_max_site3[magpie_feature] = feature_value
elif magpiedata_max_site3[magpie_feature] == 0:
magpiedata_max_site3[magpie_feature] = feature_value
# Min features by site
if magpiedata_min_site3[magpie_feature] > 0:
if feature_value < magpiedata_min_site3[magpie_feature]:
magpiedata_min_site3[magpie_feature] = feature_value
elif magpiedata_min_site3[magpie_feature] == 0:
magpiedata_min_site3[magpie_feature] = feature_value
# Difference features (max - min)
magpiedata_difference_site3[magpie_feature] = magpiedata_max_site3[magpie_feature] - magpiedata_min_site3[magpie_feature]
# Add Site couplings here
# Pairwise couplings: mean of the two sites' averages, and (largest max - smallest min) over the pair.
# NOTE(review): these statements have no loop of their own; they use whatever `magpie_feature` is
# currently bound by the loops above and accumulate with `+=`, so the result depends on how often and
# at which point they run. They also need the site1site2/site1site3/site2site3 dicts, which are only
# created when number_sites == 3. Verify the intended nesting level against the upstream source.
magpiedata_composition_average_site1site2[magpie_feature] += (magpiedata_composition_average_site1[magpie_feature]+magpiedata_composition_average_site2[magpie_feature])/2
magpiedata_arithmetic_average_site1site2[magpie_feature] += (magpiedata_arithmetic_average_site1[magpie_feature]+magpiedata_arithmetic_average_site2[magpie_feature])/2
#magpiedata_difference_site1site2[magpie_feature] += abs(magpiedata_difference_site1[magpie_feature]-magpiedata_difference_site2[magpie_feature])
magpiedata_difference_site1site2[magpie_feature] += max(magpiedata_max_site1[magpie_feature], magpiedata_max_site2[magpie_feature])-min(magpiedata_min_site1[magpie_feature],magpiedata_min_site2[magpie_feature])
magpiedata_composition_average_site1site3[magpie_feature] += (magpiedata_composition_average_site1[magpie_feature]+magpiedata_composition_average_site3[magpie_feature])/2
magpiedata_arithmetic_average_site1site3[magpie_feature] += (magpiedata_arithmetic_average_site1[magpie_feature]+magpiedata_arithmetic_average_site3[magpie_feature])/2
#magpiedata_difference_site1site3[magpie_feature] += abs(magpiedata_difference_site1[magpie_feature]-magpiedata_difference_site3[magpie_feature])
magpiedata_difference_site1site3[magpie_feature] += max(magpiedata_max_site1[magpie_feature],magpiedata_max_site3[magpie_feature]) - min(magpiedata_min_site1[magpie_feature], magpiedata_min_site3[magpie_feature])
magpiedata_composition_average_site2site3[magpie_feature] += (magpiedata_composition_average_site2[magpie_feature]+magpiedata_composition_average_site3[magpie_feature])/2
magpiedata_arithmetic_average_site2site3[magpie_feature] += (magpiedata_arithmetic_average_site2[magpie_feature]+magpiedata_arithmetic_average_site3[magpie_feature])/2
#magpiedata_difference_site2site3[magpie_feature] += abs(magpiedata_difference_site2[magpie_feature]-magpiedata_difference_site3[magpie_feature])
magpiedata_difference_site2site3[magpie_feature] += max(magpiedata_max_site2[magpie_feature], magpiedata_max_site3[magpie_feature]) - min(magpiedata_min_site2[magpie_feature], magpiedata_min_site3[magpie_feature])
# Change names of features to reflect each computed type of magpie feature (max, min, etc.)
# Suffixes used: _composition_average, _arithmetic_average, _max_value, _min_value, _difference
magpiedata_composition_average_renamed = {}
magpiedata_arithmetic_average_renamed = {}
magpiedata_max_renamed = {}
magpiedata_min_renamed = {}
magpiedata_difference_renamed = {}
for key in magpiedata_composition_average:
magpiedata_composition_average_renamed[key+"_composition_average"] = magpiedata_composition_average[key]
for key in magpiedata_arithmetic_average:
magpiedata_arithmetic_average_renamed[key+"_arithmetic_average"] = magpiedata_arithmetic_average[key]
for key in magpiedata_max:
magpiedata_max_renamed[key+"_max_value"] = magpiedata_max[key]
for key in magpiedata_min:
magpiedata_min_renamed[key+"_min_value"] = magpiedata_min[key]
for key in magpiedata_difference:
magpiedata_difference_renamed[key+"_difference"] = magpiedata_difference[key]
# Rename feature dicts for sublattice specific cases
# All *_renamed dicts are created unconditionally; only those matching number_sites are filled and returned.
magpiedata_composition_average_site1_renamed = {}
magpiedata_arithmetic_average_site1_renamed = {}
magpiedata_max_site1_renamed = {}
magpiedata_min_site1_renamed = {}
magpiedata_difference_site1_renamed = {}
magpiedata_composition_average_site2_renamed = {}
magpiedata_arithmetic_average_site2_renamed = {}
magpiedata_max_site2_renamed = {}
magpiedata_min_site2_renamed = {}
magpiedata_difference_site2_renamed = {}
magpiedata_composition_average_site3_renamed = {}
magpiedata_arithmetic_average_site3_renamed = {}
magpiedata_max_site3_renamed = {}
magpiedata_min_site3_renamed = {}
magpiedata_difference_site3_renamed = {}
# Couplings between sites
magpiedata_composition_average_site1site2_renamed = {}
magpiedata_arithmetic_average_site1site2_renamed = {}
magpiedata_difference_site1site2_renamed = {}
magpiedata_composition_average_site1site3_renamed = {}
magpiedata_arithmetic_average_site1site3_renamed = {}
magpiedata_difference_site1site3_renamed = {}
magpiedata_composition_average_site2site3_renamed = {}
magpiedata_arithmetic_average_site2site3_renamed = {}
magpiedata_difference_site2site3_renamed = {}
# Per-site keys get a "Site<N>_" prefix (or "Site<A>Site<B>_" for couplings) in addition to the suffix
if site_dict:
if number_sites == 1:
for key in magpiedata_composition_average_site1:
magpiedata_composition_average_site1_renamed["Site1_"+ key + "_composition_average"] = magpiedata_composition_average_site1[key]
for key in magpiedata_arithmetic_average_site1:
magpiedata_arithmetic_average_site1_renamed["Site1_"+ key + "_arithmetic_average"] = magpiedata_arithmetic_average_site1[key]
for key in magpiedata_max_site1:
magpiedata_max_site1_renamed["Site1_"+ key + "_max_value"] = magpiedata_max_site1[key]
for key in magpiedata_min_site1:
magpiedata_min_site1_renamed["Site1_"+ key + "_min_value"] = magpiedata_min_site1[key]
for key in magpiedata_difference_site1:
magpiedata_difference_site1_renamed["Site1_"+ key + "_difference"] = magpiedata_difference_site1[key]
elif number_sites == 2:
for key in magpiedata_composition_average_site1:
magpiedata_composition_average_site1_renamed["Site1_"+ key + "_composition_average"] = magpiedata_composition_average_site1[key]
for key in magpiedata_arithmetic_average_site1:
magpiedata_arithmetic_average_site1_renamed["Site1_"+ key + "_arithmetic_average"] = magpiedata_arithmetic_average_site1[key]
for key in magpiedata_max_site1:
magpiedata_max_site1_renamed["Site1_"+ key + "_max_value"] = magpiedata_max_site1[key]
for key in magpiedata_min_site1:
magpiedata_min_site1_renamed["Site1_"+ key + "_min_value"] = magpiedata_min_site1[key]
for key in magpiedata_difference_site1:
magpiedata_difference_site1_renamed["Site1_"+ key + "_difference"] = magpiedata_difference_site1[key]
for key in magpiedata_composition_average_site2:
magpiedata_composition_average_site2_renamed["Site2_"+ key + "_composition_average"] = magpiedata_composition_average_site2[key]
for key in magpiedata_arithmetic_average_site2:
magpiedata_arithmetic_average_site2_renamed["Site2_"+ key + "_arithmetic_average"] = magpiedata_arithmetic_average_site2[key]
for key in magpiedata_max_site2:
magpiedata_max_site2_renamed["Site2_"+ key + "_max_value"] = magpiedata_max_site2[key]
for key in magpiedata_min_site2:
magpiedata_min_site2_renamed["Site2_"+ key + "_min_value"] = magpiedata_min_site2[key]
for key in magpiedata_difference_site2:
magpiedata_difference_site2_renamed["Site2_"+ key + "_difference"] = magpiedata_difference_site2[key]
elif number_sites == 3:
for key in magpiedata_composition_average_site1:
magpiedata_composition_average_site1_renamed["Site1_"+ key + "_composition_average"] = magpiedata_composition_average_site1[key]
for key in magpiedata_arithmetic_average_site1:
magpiedata_arithmetic_average_site1_renamed["Site1_"+ key + "_arithmetic_average"] = magpiedata_arithmetic_average_site1[key]
for key in magpiedata_max_site1:
magpiedata_max_site1_renamed["Site1_"+ key + "_max_value"] = magpiedata_max_site1[key]
for key in magpiedata_min_site1:
magpiedata_min_site1_renamed["Site1_"+ key + "_min_value"] = magpiedata_min_site1[key]
for key in magpiedata_difference_site1:
magpiedata_difference_site1_renamed["Site1_"+ key + "_difference"] = magpiedata_difference_site1[key]
for key in magpiedata_composition_average_site2:
magpiedata_composition_average_site2_renamed["Site2_"+ key + "_composition_average"] = magpiedata_composition_average_site2[key]
for key in magpiedata_arithmetic_average_site2:
magpiedata_arithmetic_average_site2_renamed["Site2_"+ key + "_arithmetic_average"] = magpiedata_arithmetic_average_site2[key]
for key in magpiedata_max_site2:
magpiedata_max_site2_renamed["Site2_"+ key + "_max_value"] = magpiedata_max_site2[key]
for key in magpiedata_min_site2:
magpiedata_min_site2_renamed["Site2_"+ key + "_min_value"] = magpiedata_min_site2[key]
for key in magpiedata_difference_site2:
magpiedata_difference_site2_renamed["Site2_"+ key + "_difference"] = magpiedata_difference_site2[key]
for key in magpiedata_composition_average_site3:
magpiedata_composition_average_site3_renamed["Site3_"+ key + "_composition_average"] = magpiedata_composition_average_site3[key]
for key in magpiedata_arithmetic_average_site3:
magpiedata_arithmetic_average_site3_renamed["Site3_"+ key + "_arithmetic_average"] = magpiedata_arithmetic_average_site3[key]
for key in magpiedata_max_site3:
magpiedata_max_site3_renamed["Site3_"+ key + "_max_value"] = magpiedata_max_site3[key]
# NOTE(review): iterates the keys of magpiedata_min_site1 while reading magpiedata_min_site3 - presumably
# a copy/paste slip. Both dicts receive the same keys in the initialisation loop, so the output is the same.
for key in magpiedata_min_site1:
magpiedata_min_site3_renamed["Site3_"+ key + "_min_value"] = magpiedata_min_site3[key]
for key in magpiedata_difference_site3:
magpiedata_difference_site3_renamed["Site3_"+ key + "_difference"] = magpiedata_difference_site3[key]
# Couplings between sites
for key in magpiedata_composition_average_site1site2:
magpiedata_composition_average_site1site2_renamed["Site1Site2_"+ key + "_composition_average"] = magpiedata_composition_average_site1site2[key]
for key in magpiedata_arithmetic_average_site1site2:
magpiedata_arithmetic_average_site1site2_renamed["Site1Site2_" + key + "_arithmetic_average"] = magpiedata_arithmetic_average_site1site2[key]
for key in magpiedata_difference_site1site2:
magpiedata_difference_site1site2_renamed["Site1Site2_" + key + "_difference"] = magpiedata_difference_site1site2[key]
for key in magpiedata_composition_average_site1site3:
magpiedata_composition_average_site1site3_renamed["Site1Site3_"+ key + "_composition_average"] = magpiedata_composition_average_site1site3[key]
for key in magpiedata_arithmetic_average_site1site3:
magpiedata_arithmetic_average_site1site3_renamed["Site1Site3_" + key + "_arithmetic_average"] = magpiedata_arithmetic_average_site1site3[key]
for key in magpiedata_difference_site1site3:
magpiedata_difference_site1site3_renamed["Site1Site3_" + key + "_difference"] = magpiedata_difference_site1site3[key]
for key in magpiedata_composition_average_site2site3:
magpiedata_composition_average_site2site3_renamed["Site2Site3_"+ key + "_composition_average"] = magpiedata_composition_average_site2site3[key]
for key in magpiedata_arithmetic_average_site2site3:
magpiedata_arithmetic_average_site2site3_renamed["Site2Site3_" + key + "_arithmetic_average"] = magpiedata_arithmetic_average_site2site3[key]
for key in magpiedata_difference_site2site3:
magpiedata_difference_site2site3_renamed["Site2Site3_" + key + "_difference"] = magpiedata_difference_site2site3[key]
# Return order is fixed: the five whole-composition dicts first, then five dicts per site (Site1, Site2, Site3),
# then three coupling dicts per site pair (Site1Site2, Site1Site3, Site2Site3). No return statement covers
# a site_dict with more than three sites.
if site_dict:
if number_sites == 1:
return (magpiedata_composition_average_renamed, magpiedata_arithmetic_average_renamed,
magpiedata_max_renamed, magpiedata_min_renamed, magpiedata_difference_renamed,
magpiedata_composition_average_site1_renamed, magpiedata_arithmetic_average_site1_renamed,
magpiedata_max_site1_renamed, magpiedata_min_site1_renamed, magpiedata_difference_site1_renamed)
elif number_sites == 2:
return (magpiedata_composition_average_renamed, magpiedata_arithmetic_average_renamed,
magpiedata_max_renamed, magpiedata_min_renamed, magpiedata_difference_renamed,
magpiedata_composition_average_site1_renamed, magpiedata_arithmetic_average_site1_renamed,
magpiedata_max_site1_renamed, magpiedata_min_site1_renamed, magpiedata_difference_site1_renamed,
magpiedata_composition_average_site2_renamed, magpiedata_arithmetic_average_site2_renamed,
magpiedata_max_site2_renamed, magpiedata_min_site2_renamed, magpiedata_difference_site2_renamed)
elif number_sites == 3:
return (magpiedata_composition_average_renamed, magpiedata_arithmetic_average_renamed,
magpiedata_max_renamed, magpiedata_min_renamed, magpiedata_difference_renamed,
magpiedata_composition_average_site1_renamed, magpiedata_arithmetic_average_site1_renamed,
magpiedata_max_site1_renamed, magpiedata_min_site1_renamed, magpiedata_difference_site1_renamed,
magpiedata_composition_average_site2_renamed, magpiedata_arithmetic_average_site2_renamed,
magpiedata_max_site2_renamed, magpiedata_min_site2_renamed, magpiedata_difference_site2_renamed,
magpiedata_composition_average_site3_renamed, magpiedata_arithmetic_average_site3_renamed,
magpiedata_max_site3_renamed, magpiedata_min_site3_renamed, magpiedata_difference_site3_renamed,
magpiedata_composition_average_site1site2_renamed, magpiedata_arithmetic_average_site1site2_renamed, magpiedata_difference_site1site2_renamed,
magpiedata_composition_average_site1site3_renamed, magpiedata_arithmetic_average_site1site3_renamed, magpiedata_difference_site1site3_renamed,
magpiedata_composition_average_site2site3_renamed, magpiedata_arithmetic_average_site2site3_renamed, magpiedata_difference_site2site3_renamed)
else:
return (magpiedata_composition_average_renamed, magpiedata_arithmetic_average_renamed,
magpiedata_max_renamed, magpiedata_min_renamed, magpiedata_difference_renamed)
def _get_atomic_magpie_features(self, composition, data_path):
    """
    Look up the elemental MAGPIE property values for every element of a composition.

    Args:
        composition: (str), chemical formula; parsed with pymatgen's Composition
        data_path: (str), directory containing the MAGPIE '<feature name>.table' files. Line N (1-indexed)
            of each file is read as the value for the element with atomic number N.

    Returns:
        magpiedata_atomic: (dict), {element symbol: {feature name: value}}. A value is a float, or the string
            'NaN' when the table entry contains "Missing" or "NA" or cannot be parsed as a float. The
            "OxidationStates" table never yields a numeric value, and a feature is absent for an element
            whose atomic number exceeds the number of lines in that table.
    """
    table_suffix = '.table'
    # Feature names are the table file names without their extension. Matching on the suffix (rather than
    # on '.table' appearing anywhere in the name) keeps backup/temporary files from being mistaken for tables.
    magpie_feature_names = [
        fname[:-len(table_suffix)] for fname in os.listdir(data_path) if fname.endswith(table_suffix)
    ]

    composition = Composition(composition)
    element_list, _ = self._get_element_list(composition=composition)
    atomic_numbers = {element: Element(element).Z for element in element_list}

    # Read every table exactly once (instead of once per element) and make sure the handle is always closed.
    table_lines = {}
    for feature_name in magpie_feature_names:
        with open(os.path.join(data_path, feature_name + table_suffix), 'r') as table_file:
            table_lines[feature_name] = table_file.readlines()

    magpiedata_atomic = {}
    for element, atomic_number in atomic_numbers.items():
        atomic_values = {}
        for feature_name in magpie_feature_names:
            lines = table_lines[feature_name]
            if atomic_number > len(lines):
                # The table has no row for this element; leave the feature out, as before.
                continue
            feature_value = lines[atomic_number - 1]
            if "Missing" in feature_value or "NA" in feature_value:
                # The string sentinel 'NaN' is what _get_computed_magpie_features checks for.
                atomic_values[feature_name] = 'NaN'
            elif feature_name != "OxidationStates":
                try:
                    atomic_values[feature_name] = float(feature_value.strip())
                except ValueError:
                    atomic_values[feature_name] = 'NaN'
        magpiedata_atomic[element] = atomic_values
    return magpiedata_atomic
def _get_element_list(self, composition):
    """
    Summarise a composition as its distinct elements and its total atom count.

    Args:
        composition: object exposing get_el_amt_dict(), which returns a mapping of element -> amount

    Returns:
        element_list: (list), the elements in the order the mapping yields them
        atoms_per_formula_unit: (number), sum of all element amounts (0 for an empty mapping)
    """
    amounts_by_element = composition.get_el_amt_dict()
    total_atoms = sum(amounts_by_element.values())
    # Mapping keys are already unique, so listing them preserves both content and order.
    distinct_elements = list(amounts_by_element)
    return distinct_elements, total_atoms
class MaterialsProjectFeatureGeneration(object):
    """
    Class to generate new features using Materials Project data and dataframe containing material compositions

    Args:
        dataframe: (dataframe), dataframe containing x and y data and feature names
        mapi_key: (str), string denoting your Materials Project API key
        composition_feature: (str), name of the dataframe column holding the chemical compositions

    Methods:
        generate_materialsproject_features : generates materials project feature set based on compositions in dataframe
            Args:
                None
            Returns:
                dataframe: (dataframe), dataframe containing materials project feature set
    """

    # Properties read from the nested "elasticity" entry of a Materials Project record
    _ELASTIC_PROPERTIES = ("G_Voigt_Reuss_Hill", "G_Reuss", "K_Voigt_Reuss_Hill", "K_Reuss", "K_Voigt", "G_Voigt",
                           "G_VRH", "homogeneous_poisson", "poisson_ratio", "universal_anisotropy", "K_VRH",
                           "elastic_anisotropy")
    # Properties read from the top level of a Materials Project record
    _TOP_LEVEL_PROPERTIES = ("band_gap", "e_above_hull", "formation_energy_per_atom", "nelements",
                             "energy_per_atom", "volume", "density", "total_magnetization")
    # Property read from the nested "spacegroup" entry; exported under the key "Spacegroup_number"
    _SPACEGROUP_PROPERTY = "number"

    def __init__(self, dataframe, mapi_key, composition_feature):
        self.dataframe = dataframe
        self.mapi_key = mapi_key
        self.composition_feature = composition_feature

    def generate_materialsproject_features(self):
        """
        Query the Materials Project for every composition and append the results as new columns.

        Returns:
            dataframe: (dataframe), the input dataframe with one extra column per Materials Project feature

        Raises:
            utils.MissingColumnError: if the dataframe has no column named self.composition_feature
        """
        try:
            compositions = self.dataframe[self.composition_feature]
        except KeyError as e:
            raise utils.MissingColumnError(f'No column named {self.composition_feature} in csv file') from e

        # Queries run serially. The previous version also created a 20-process multiprocessing.Pool that was
        # never used nor closed; it has been removed so no worker processes are leaked.
        # Each distinct composition is looked up once; repeated rows reuse the same record.
        mpdata_dict_composition = {}
        for composition in compositions:
            if composition not in mpdata_dict_composition:
                mpdata_dict_composition[composition] = self._get_data_from_materials_project(composition)

        dataframe_mp = pd.DataFrame.from_dict(data=mpdata_dict_composition, orient='index')
        # Need to reorder compositions in new dataframe to match input dataframe
        dataframe_mp = dataframe_mp.reindex(compositions.tolist())
        # Discard the composition index so the column is not duplicated in the merged dataframe
        dataframe_mp = dataframe_mp.reset_index(drop=True)
        # Merge Materials Project feature dataframe with originally supplied dataframe
        # NOTE(review): the merge concatenates on the index, so this presumably expects self.dataframe to
        # carry a default 0..n-1 index - confirm against callers.
        return DataframeUtilities().merge_dataframe_columns(dataframe1=self.dataframe, dataframe2=dataframe_mp)

    @classmethod
    def _condense_structure_data(cls, structure_data):
        """
        Flatten one raw Materials Project record into the feature dict used for the dataframe.

        Args:
            structure_data: (dict), raw record of the selected structure; may be empty

        Returns:
            condensed: (dict), one entry per feature; '' marks a value that is not available
        """
        record = structure_data if isinstance(structure_data, dict) else {}
        elasticity = record.get("elasticity")
        if not isinstance(elasticity, dict):
            # e.g. None when no elastic tensor has been computed for this material
            elasticity = {}
        spacegroup = record.get("spacegroup")
        if not isinstance(spacegroup, dict):
            spacegroup = {}

        condensed = {prop: elasticity.get(prop, '') for prop in cls._ELASTIC_PROPERTIES}
        for prop in cls._TOP_LEVEL_PROPERTIES:
            condensed[prop] = record.get(prop, '')
        # Always use the prefixed key, also when the value is missing, so every record has the same columns
        condensed["Spacegroup_" + cls._SPACEGROUP_PROPERTY] = spacegroup.get(cls._SPACEGROUP_PROPERTY, '')
        return condensed

    def _get_data_from_materials_project(self, composition):
        """
        Fetch the Materials Project record of the most stable structure for one composition.

        Args:
            composition: (str), formula, chemical system or material id passed to MPRester.get_data

        Returns:
            structure_data_dict_condensed: (dict), feature name -> value, '' where no data is available
        """
        mprester = MPRester(self.mapi_key)
        structure_data_list = mprester.get_data(chemsys_formula_id=composition)

        # Only keep the most stable compound, i.e. the one with the lowest energy above hull
        if structure_data_list:
            structure_data_most_stable = min(structure_data_list, key=lambda entry: entry['e_above_hull'])
        else:
            structure_data_most_stable = {}

        # Trim down the full Materials Project data dict to include only quantities relevant to make features
        structure_data_dict_condensed = self._condense_structure_data(structure_data_most_stable)

        if all(val == '' for val in structure_data_dict_condensed.values()):
            log.warning(f'No data found for composition "{composition}" using materials project')
        else:
            log.info(f'MAterials Project Feature Generation {composition} {structure_data_dict_condensed}')
        return structure_data_dict_condensed
[docs]class DataframeUtilities(object):
"""
Class of basic utilities for dataframe manipulation, and exchanging between dataframes and numpy arrays
Args:
None
Methods:
merge_dataframe_columns : merge two dataframes by concatenating the column names (duplicate columns omitted)
Args:
dataframe1: (dataframe), a pandas dataframe object
dataframe2: (dataframe), a pandas dataframe object
Returns:
dataframe: (dataframe), merged dataframe
merge_dataframe_rows : merge two dataframes by concatenating the row contents (duplicate rows omitted)
Args:
dataframe1: (dataframe), a pandas dataframe object
dataframe2: (dataframe), a pandas dataframe object
Returns:
dataframe: (dataframe), merged dataframe
get_dataframe_statistics : obtain basic statistics about data contained in the dataframe
Args:
dataframe: (dataframe), a pandas dataframe object
Returns:
dataframe_stats: (dataframe), dataframe containing input dataframe statistics
dataframe_to_array : transform a pandas dataframe to a numpy array
Args:
dataframe: (dataframe), a pandas dataframe object
Returns:
array: (numpy array), a numpy array representation of the inputted dataframe
array_to_dataframe : transform a numpy array to a pandas dataframe
Args:
array: (numpy array), a numpy array
Returns:
dataframe: (dataframe), a pandas dataframe representation of the inputted numpy array
concatenate_arrays : merge two numpy arrays by concatenating along the columns
Args:
Xarray: (numpy array), a numpy array object
yarray: (numpy array), a numpy array object
Returns:
array: (numpy array), a numpy array merging the two input arrays
assign_columns_as_features : adds column names to dataframe based on the x and y feature names
Args:
dataframe: (dataframe), a pandas dataframe object
x_features: (list), list containing x feature names
y_feature: (str), target feature name
Returns:
dataframe: (dataframe), dataframe containing same data as input, with columns labeled with features
save_all_dataframe_statistics : obtain dataframe statistics and save it to a csv file
Args:
dataframe: (dataframe), a pandas dataframe object
data_path: (str), file path to save dataframe statistics to
Returns:
fname: (str), name of file dataframe stats saved to
"""
[docs] @classmethod
def merge_dataframe_columns(cls, dataframe1, dataframe2):
dataframe = pd.concat([dataframe1, dataframe2], axis=1, join='outer')
return dataframe
[docs] @classmethod
def merge_dataframe_rows(cls, dataframe1, dataframe2):
dataframe = pd.merge(left=dataframe1, right=dataframe2, how='outer')
#dataframe = pd.concat([dataframe1, dataframe2], axis=1, join='outer')
return dataframe
[docs] @classmethod
def get_dataframe_statistics(cls, dataframe):
dataframe_stats = dataframe.describe(include='all')
return dataframe_stats
[docs] @classmethod
def dataframe_to_array(cls, dataframe):
array = np.asarray(dataframe)
return array
[docs] @classmethod
def array_to_dataframe(cls, array):
dataframe = pd.DataFrame(data=array, index=range(0, len(array)))
return dataframe
[docs] @classmethod
def concatenate_arrays(cls, X_array, y_array):
array = np.concatenate((X_array, y_array), axis=1)
return array
[docs] @classmethod
def assign_columns_as_features(cls, dataframe, x_features, y_feature, remove_first_row=True):
column_dict = {}
x_and_y_features = [feature for feature in x_features]
x_and_y_features.append(y_feature)
for i, feature in enumerate(x_and_y_features):
column_dict[i] = feature
dataframe = dataframe.rename(columns=column_dict)
if remove_first_row == bool(True):
dataframe = dataframe.drop([0]) # Need to remove feature names from first row so can obtain data
return dataframe
[docs] @classmethod
def save_all_dataframe_statistics(cls, dataframe, configdict):
    """
    Compute summary statistics for a dataframe and write them to a csv file.

    Args:
        dataframe: (dataframe), a pandas dataframe object
        configdict: (dict), configuration dictionary; its 'General Setup' section must contain
            'save_path' (prefix the file name is appended to with '/') and 'target_feature'
            (embedded in the file name)

    Returns:
        fname: (str), name of file dataframe stats saved to
    """
    stats = cls.get_dataframe_statistics(dataframe=dataframe)
    general_setup = configdict['General Setup']
    target_name = general_setup['target_feature']  # TODO
    stats_file = 'input_data_statistics_' + target_name + '.csv'
    fname = general_setup['save_path'] + "/" + stats_file
    # index=True keeps the statistic names (row labels) in the csv
    stats.to_csv(fname, index=True)
    return fname
# Old Citrine classes likely to be deleted
"""
class Citrine(BaseEstimator, TransformerMixin):
Class that wraps CitrineFeatureGeneration, giving it scikit-learn structure
Args:
composition_feature: (str), string denoting a chemical composition to generate elemental features from
api_key: (str), string denoting your Citrine API key
Methods:
fit: pass through, copies input columns as pre-generated features
Args:
df: (dataframe), input dataframe containing X and y data
transform: generate Citrine features
Args:
df: (dataframe), input dataframe containing X and y data
Returns:
df: (dataframe), output dataframe containing generated features, original features and y data
def __init__(self, composition_feature, api_key):
self.composition_feature = composition_feature
self.api_key = api_key
def fit(self, df, y=None):
self.original_features = df.columns
return self
def transform(self, df):
# make citrine api call (uses internet)
cfg = CitrineFeatureGeneration(df.copy(), self.api_key, self.composition_feature)
df = cfg.generate_citrine_features()
df = df.drop(self.original_features, axis=1)
# delete missing values, generation makes a lot of garbage.
df = clean_dataframe(df)
assert self.composition_feature not in df.columns
return df
"""
"""
class CitrineFeatureGeneration(object):
Class to generate new features using Citrine data and dataframe containing material compositions
Dataframe must have a column holding the material compositions, named by composition_feature (e.g. "Material compositions").
Args:
dataframe: (dataframe), dataframe containing x and y data and feature names
api_key: (str), your Citrination API key
composition_feature: (str), string denoting a chemical composition to generate elemental features from
Methods:
generate_citrine_features : generates Citrine feature set based on compositions in dataframe
Args:
None
Returns:
dataframe: (dataframe), dataframe containing citrine generated feature set
def __init__(self, dataframe, api_key, composition_feature):
self.dataframe = dataframe
self.api_key = api_key
self.client = CitrinationClient(api_key, 'https://citrination.com')
self.composition_feature = composition_feature
def generate_citrine_features(self):
log.warning('WARNING: You have specified generation of features from Citrine. Based on which'
' materials you are interested in, there may be many records to parse through, thus'
' this routine may take a long time to complete!')
try:
compositions = self.dataframe[self.composition_feature].tolist()
except KeyError as e:
log.error(f'original python error: {str(e)}')
raise utils.MissingColumnError('Error! No column named {self.composition_feature} found in your input data file. '
'To use this feature generation routine, you must supply a material composition for each data point')
citrine_dict_property_min = dict()
citrine_dict_property_max = dict()
citrine_dict_property_avg = dict()
# before: ~11 seconds
# made into a func so we can do requests in parallel
# now like 1.8 secs!
pool = multiprocessing.Pool(processes=20)
#result_tuples = pool.map(self._load_composition, compositions)
result_tuples = map(self._load_composition, compositions)
for comp, (prop_min, prop_max, prop_avg) in zip(compositions, result_tuples):
citrine_dict_property_min[comp] = prop_min
citrine_dict_property_max[comp] = prop_max
citrine_dict_property_avg[comp] = prop_avg
dataframe = self.dataframe
citrine_dict_list = [citrine_dict_property_min, citrine_dict_property_max, citrine_dict_property_avg]
for citrine_dict in citrine_dict_list:
dataframe_citrine = pd.DataFrame.from_dict(data=citrine_dict, orient='index')
# Need to reorder compositions in new dataframe to match input dataframe
dataframe_citrine = dataframe_citrine.reindex(self.dataframe[self.composition_feature].tolist())
# Need to make compositions the first column, instead of the row names
dataframe_citrine.index.name = self.composition_feature
dataframe_citrine.reset_index(inplace=True)
# Need to delete duplicate column before merging dataframes
del dataframe_citrine[self.composition_feature]
# Merge magpie feature dataframe with originally supplied dataframe
dataframe = DataframeUtilities().merge_dataframe_columns(dataframe1=dataframe, dataframe2=dataframe_citrine)
return dataframe
def _load_composition(self, composition):
pifquery = self._get_pifquery(composition=composition)
property_name_list, property_value_list = self._get_pifquery_property_list(pifquery=pifquery)
#print("Citrine Feature Generation: ", composition, property_name_list, property_value_list)
property_names_unique, parsed_property_min, parsed_property_max, parsed_property_avg = self._parse_pifquery_property_list(property_name_list=property_name_list, property_value_list=property_value_list)
return parsed_property_min, parsed_property_max, parsed_property_avg
def _get_pifquery(self, composition):
# TODO: does this stop csv generation on first invalid composition?
# TODO: Is there a way to send many compositions in one call to citrine?
pif_query = PifQuery(system=SystemQuery(chemical_formula=ChemicalFieldQuery(filter=ChemicalFilter(equal=composition))))
# Check if any results found
if 'hits' not in self.client.search(pif_query).as_dictionary():
raise KeyError('No results found!')
pifquery = self.client.search(pif_query).as_dictionary()['hits']
return pifquery
def _get_pifquery_property_list(self, pifquery):
property_name_list = list()
property_value_list = list()
accepted_properties_list = [
'mass', 'space group', 'band', 'Band', 'energy', 'volume', 'density', 'dielectric',
'Dielectric', 'Enthalpy', 'Convex', 'Magnetization', 'Elements', 'Modulus', 'Shear',
"Poisson's", 'Elastic', 'Energy'
]
for result_number, results in enumerate(pifquery):
for i, dictionary in enumerate(results['system']['properties']):
if 'name' not in dictionary or dictionary['name'] == "CIF File": continue
value = dictionary['name']
for entry in accepted_properties_list:
if entry not in value: continue
property_name_list.append(value)
try:
property_value_list.append(
float(dictionary['scalars'][0]['value']))
except (ValueError, KeyError):
property_name_list.pop(-1)
continue
#for result_number, results in enumerate(pifquery):
# property_value = results['system']['properties']
# for list_index, list_element in enumerate(property_value):
# for name, value in property_value[list_index].items():
# if name == 'name' and value != "CIF File":
# for entry in accepted_properties_list:
# if entry in value:
# property_name_list.append(value)
# try:
# property_value_list.append(
# float(property_value[list_index]['scalars'][0]['value']))
# except (ValueError, KeyError):
# # print('found something to remove', property_value[list_index]['scalars'][0]['value'])
# property_name_list.pop(-1)
# continue
return property_name_list, property_value_list
def _parse_pifquery_property_list(self, property_name_list, property_value_list):
parsed_property_max = dict()
parsed_property_min = dict()
parsed_property_avg = dict()
property_names_unique = list()
if len(property_name_list) != len(property_value_list):
print('Error! Length of property name and property value lists are not the same. There must be a bug in the _get_pifquerey_property_list method')
raise IndexError("property_name_list and property_value_list are not the same size.")
else:
# Get unique property names
for name in property_name_list:
if name not in property_names_unique:
property_names_unique.append(name)
for unique_name in property_names_unique:
unique_property = list()
unique_property_avg = 0
count = 0
for i, name in enumerate(property_name_list):
# Only include property values whose name are same as those in unique_name list
if name == unique_name:
count += 1 # count how many instances of the same property occur
unique_property_avg += property_value_list[i]
unique_property.append(property_value_list[i])
unique_property_min = min(entry for entry in unique_property)
unique_property_max = max(entry for entry in unique_property)
unique_property_avg = unique_property_avg/count
parsed_property_min[str(unique_name)+"_min"] = unique_property_min
parsed_property_max[str(unique_name) + "_max"] = unique_property_max
parsed_property_avg[str(unique_name) + "_avg"] = unique_property_avg
return property_names_unique, parsed_property_min, parsed_property_max, parsed_property_avg
"""