Composition generation

Here, we generate a set of quaternary oxide compositions using a modified smact_filter function and then turn the results into a dataframe with features that can be read by a machine learning algorithm.


In [1]:
### Imports ###
import smact
from smact import screening
from itertools import combinations, product
import multiprocessing
from pymatgen import Composition
import pandas as pd
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf

In [2]:
# Define the elements we are interested in
all_el = smact.element_dictionary()
symbol_list = list(all_el.keys())
# Symbols left out of the search. 'O' is listed because smact_filter appends
# oxygen to every element combination itself.
do_not_want = ['H', 'He', 'B', 'C', 'O', 'Ne', 'Ar', 'Kr', 'Tc', 'Xe', 'Rn',
              'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 
              'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',
              'Ra', 'Fr', 'At', 'Po', 'Pm', 'Eu', 'Tb', 'Yb']
good_elements = [all_el[x] for x in symbol_list if x not in do_not_want]

# Materialise the triples as a list. A bare combinations() iterator is used up
# by the first Pool.map call, so re-running the screening cell on its own would
# silently produce zero compositions.
all_el_combos = list(combinations(good_elements, 3))

def smact_filter(els, threshold=8):
    """Screen one set of elements, combined with oxygen, using the SMACT tests.

    Every combination of the elements' oxidation states (with oxygen fixed
    at -2) is checked for charge neutrality and then for the Pauling
    electronegativity test.

    Args:
        els: Sequence of element objects exposing ``symbol``,
            ``pauling_eneg`` and ``oxidation_states``. Any number of
            elements is accepted; this notebook passes triples.
        threshold: Forwarded to ``smact.neutral_ratios`` as its
            ``threshold`` argument. Defaults to 8, the value that was
            previously hard-coded.

    Returns:
        A list of ``(elements, ratio)`` tuples, one for each oxidation-state
        combination that passes both tests. ``elements`` is the list of
        element symbols followed by ``'O'``; ``ratio`` is the first entry of
        the ratios returned by ``smact.neutral_ratios``.
    """
    elements = [e.symbol for e in els] + ['O']

    # Pauling electronegativities; 3.44 is the value used for oxygen.
    electronegativities = [e.pauling_eneg for e in els] + [3.44]

    all_compounds = []
    # For each set of species (in oxidation states) apply both SMACT tests
    for cation_states in product(*(e.oxidation_states for e in els)):
        ox_states = list(cation_states) + [-2]
        # Test for charge balance
        cn_e, cn_r = smact.neutral_ratios(ox_states, threshold=threshold)
        if not cn_e:
            continue
        # Electronegativity test
        if screening.pauling_test(ox_states, electronegativities):
            # Only the first neutral ratio is kept for each combination.
            all_compounds.append((elements, cn_r[0]))
    return all_compounds

Multiprocessing is used to speed things up (generating all compositions takes ~40 minutes on a 4 GHz Intel Core i7 iMac).


In [3]:
# Screen every element combination in a worker pool; each call returns the
# list of passing compositions for one combination.
with multiprocessing.Pool() as pool:
    result = pool.map(smact_filter, all_el_combos)

# Merge the per-combination lists into a single flat list.
flat_list = []
for sublist in result:
    flat_list.extend(sublist)
print("Number of compositions: {0}".format(len(flat_list)))


Number of compositions: 3217181

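We turn our generated compositions into pretty formulas, again using multiprocessing. Different oxidation-state combinations can give the same reduced formula, so after removing duplicates there should be ~1.1M unique formulas.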


In [4]:
def comp_maker(comp):
    """Return the reduced formula string for one generated composition.

    Args:
        comp: Indexable pair whose first item is the sequence of element
            symbols and whose second item is the matching sequence of
            amounts.

    Returns:
        The ``reduced_formula`` of the ``Composition`` built from the
        concatenated symbol/amount string.
    """
    symbols, amounts = comp[0], comp[1]
    # Interleave symbol and amount, e.g. ['Li', 'O'] and (2, 1) -> 'Li2O1'.
    formula = ''.join(
        '{0}{1}'.format(symbol, amount)
        for symbol, amount in zip(symbols, amounts)
    )
    return Composition(formula).reduced_formula

# Convert every generated composition to its reduced formula in parallel.
with multiprocessing.Pool() as p:
    pretty_formulas = p.map(comp_maker, flat_list)

# Deduplicate, then sort. The iteration order of a set of strings is not
# stable between interpreter sessions (string hashing is randomised), so a
# plain list(set(...)) would give a different row order on every run.
unique_pretty_formulas = sorted(set(pretty_formulas))
print("Number of unique compositions formulas: {0}".format(len(unique_pretty_formulas)))


Number of unique compositions formulas: 1118505

In [21]:
# One row per formula in a single 'pretty_formula' column, duplicates dropped.
new_data = (
    pd.DataFrame(unique_pretty_formulas)
    .rename(columns={0: 'pretty_formula'})
    .drop_duplicates(subset='pretty_formula')
)
new_data.describe()


Out[21]:
pretty_formula
count 1118505
unique 1118505
top Sm2VBrO5
freq 1

In [22]:
# Add descriptor columns
# this will take a little time as we have over 1 million rows
#
# featurize_dataframe returns the updated frame. Rebind it each time so the new
# columns are present whether or not the installed matminer release modifies
# its input in place; discarding the return value loses them when it does not.
str_to_comp = StrToComposition(target_col_id='composition_obj')
new_data = str_to_comp.featurize_dataframe(new_data, col_id='pretty_formula')

feature_calculators = MultipleFeaturizer([cf.Stoichiometry(), 
                                          cf.ElementProperty.from_preset("magpie"),
                                          cf.ValenceOrbital(props=['avg']), 
                                          cf.IonProperty(fast=True),
                                          cf.BandCenter(), cf.AtomicOrbitals()])

feature_labels = feature_calculators.feature_labels()
new_data = feature_calculators.featurize_dataframe(new_data, col_id='composition_obj')

# Fail here, not at model-training time, if any feature column is missing.
assert set(feature_labels) <= set(new_data.columns), "featurization did not add all feature columns"





In [30]:
# Write the featurized table to a CSV file, 10000 rows per chunk.
new_data.to_csv(
    'All_oxide_comps_dataframe_featurized.csv',
    chunksize=10000,
)