In [1]:
### Imports ###
import smact
from smact import screening
from itertools import combinations, product
import multiprocessing
from pymatgen import Composition
import pandas as pd
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf
In [2]:
# Define the elements we are interested in
all_el = smact.element_dictionary()
symbol_list = list(all_el.keys())
# Symbols excluded from the cation search space. 'O' is excluded here because
# smact_filter appends oxygen to every combination itself.
do_not_want = ['H', 'He', 'B', 'C', 'O', 'Ne', 'Ar', 'Kr', 'Tc', 'Xe', 'Rn',
               'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk',
               'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',
               'Ra', 'Fr', 'At', 'Po', 'Pm', 'Eu', 'Tb', 'Yb']
good_elements = [all_el[x] for x in symbol_list if x not in do_not_want]
# Materialise the triples as a list. A bare itertools.combinations object is a
# one-shot iterator: once the screening cell has consumed it, re-running that
# cell would silently see no input and report zero compositions.
all_el_combos = list(combinations(good_elements, 3))
def smact_filter(els, threshold=8):
    """Screen one combination of elements for plausible oxide compositions.

    Every combination of the elements' oxidation states is combined with
    oxygen (oxidation state -2, Pauling electronegativity 3.44) and kept only
    if it passes both the charge-balance test and the electronegativity test.

    Args:
        els: Sequence of element objects exposing ``symbol``,
            ``pauling_eneg`` and ``oxidation_states``. Any number of
            elements is accepted; the notebook passes triples.
        threshold: Forwarded to ``smact.neutral_ratios`` as its ``threshold``
            argument. Defaults to 8, the value that used to be hard-coded.

    Returns:
        A list of ``(elements, ratio)`` tuples. ``elements`` is the list of
        element symbols with ``'O'`` appended (the same list object is shared
        by every tuple) and ``ratio`` is the first entry of the ratios
        returned by ``smact.neutral_ratios`` for an oxidation-state
        combination that passed both tests.
    """
    anion_symbol, anion_ox_state, anion_eneg = 'O', -2, 3.44

    elements = [e.symbol for e in els] + [anion_symbol]
    # Get Pauling electronegativities
    electronegativities = [e.pauling_eneg for e in els] + [anion_eneg]

    all_compounds = []
    # For each set of species (in oxidation states) apply both SMACT tests
    for cation_states in product(*(e.oxidation_states for e in els)):
        ox_states = list(cation_states) + [anion_ox_state]
        # Test for charge balance
        cn_e, cn_r = smact.neutral_ratios(ox_states, threshold=threshold)
        if not cn_e:
            continue
        # Electronegativity test
        if screening.pauling_test(ox_states, electronegativities):
            # Only the first charge-neutral ratio is recorded per combination.
            all_compounds.append((elements, cn_r[0]))
    return all_compounds
Multiprocessing is used to speed things up (generation of all compositions takes ~40 minutes on a 4GHz Intel Core i7 iMac).
In [3]:
# Run the screening function over every element combination in a worker pool;
# each call yields the list of accepted compositions for one combination.
with multiprocessing.Pool() as pool:
    result = pool.map(smact_filter, all_el_combos)

# Concatenate the per-combination lists into a single flat list.
flat_list = []
for sublist in result:
    flat_list.extend(sublist)

print("Number of compositions: {0}".format(len(flat_list)))
We turn our generated compositions into pretty formulas, again using multiprocessing. There should be ~1.1M unique formulas.
In [4]:
def comp_maker(comp):
    """Convert an (elements, amounts) pair into a reduced formula string.

    Args:
        comp: Indexable whose item 0 is a sequence of element symbols and
            whose item 1 is the matching sequence of amounts.

    Returns:
        The ``reduced_formula`` of the ``Composition`` built from the
        concatenated symbol/amount string.
    """
    symbols, amounts = comp[0], comp[1]
    # Interleave each symbol with its amount: symbol, amount, symbol, amount, ...
    raw_formula = ''.join(
        str(symbol) + str(amount) for symbol, amount in zip(symbols, amounts)
    )
    return Composition(raw_formula).reduced_formula
# Build a formula string for every screened composition in a worker pool.
with multiprocessing.Pool() as p:
    pretty_formulas = p.map(comp_maker, flat_list)

# Collapse duplicates. sorted() rather than list() so the order is reproducible:
# iteration order of a set of strings changes between kernel sessions because
# str hashing is randomised per process, which would otherwise shuffle the
# rows of anything built from this list.
unique_pretty_formulas = sorted(set(pretty_formulas))
print("Number of unique compositions formulas: {0}".format(len(unique_pretty_formulas)))
In [21]:
# One row per formula, held in a single 'pretty_formula' column, with any
# repeated formulas dropped.
new_data = (
    pd.DataFrame(unique_pretty_formulas)
    .rename(columns={0: 'pretty_formula'})
    .drop_duplicates(subset='pretty_formula')
)
new_data.describe()
Out[21]:
In [22]:
# Add descriptor columns
# this will take a little time as we have over 1 million rows

# Parse each formula string into a composition object in 'composition_obj'.
# Keep the returned frame: featurize_dataframe only modifies its argument when
# called with inplace=True, which is not the default in recent matminer
# releases, so discarding the return value would leave 'composition_obj' missing.
str_to_comp = StrToComposition(target_col_id='composition_obj')
new_data = str_to_comp.featurize_dataframe(new_data, col_id='pretty_formula')

feature_calculators = MultipleFeaturizer([
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset("magpie"),
    cf.ValenceOrbital(props=['avg']),
    cf.IonProperty(fast=True),
    cf.BandCenter(),
    cf.AtomicOrbitals(),
])
feature_labels = feature_calculators.feature_labels()
# Same reasoning as above: rebind to the frame that carries the new columns.
new_data = feature_calculators.featurize_dataframe(new_data, col_id='composition_obj')
In [30]:
# Write the featurized table to disk as CSV, 10000 rows per write chunk.
new_data.to_csv(
    path_or_buf='All_oxide_comps_dataframe_featurized.csv',
    chunksize=10000,
)