Composition generation

Here, we generate a set of quaternary oxide compositions using a modified smact_filter function and then turn the results into a dataframe with features that can be read by a machine learning algorithm.



In [1]:

    
### Imports ###
import smact
from smact import screening
from itertools import combinations, product
import multiprocessing
from pymatgen import Composition
import pandas as pd
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf



In [2]:

    
# Define the elements we are interested in
all_el = smact.element_dictionary()
symbol_list = list(all_el)
# Symbols excluded from the search. 'O' is listed here as well:
# smact_filter adds 'O' itself to every combination it is given.
do_not_want = ['H', 'He', 'B', 'C', 'O', 'Ne', 'Ar', 'Kr', 'Tc', 'Xe', 'Rn',
              'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk',
              'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',
              'Ra', 'Fr', 'At', 'Po', 'Pm', 'Eu', 'Tb', 'Yb']
good_elements = [all_el[x] for x in symbol_list if x not in do_not_want]

# Materialise the 3-element combinations as a list. A bare combinations()
# iterator is exhausted after one pass, so re-running a later cell that
# consumes it (without re-running this one) would silently see no input.
all_el_combos = list(combinations(good_elements, 3))

def smact_filter(els):
    """Return the (elements, ratio) pairs that pass both SMACT tests.

    Args:
        els: Indexable of three element objects exposing ``symbol``,
            ``pauling_eneg`` and ``oxidation_states``. Oxygen is appended
            as a fourth species with oxidation state -2 and Pauling
            electronegativity 3.44.

    Returns:
        A list of ``(elements, ratio)`` tuples, one per oxidation-state
        combination that is charge-neutral (``smact.neutral_ratios`` with
        ``threshold=8``) and passes ``screening.pauling_test``. ``elements``
        is the list of symbols (ending in ``'O'``) and ``ratio`` is the
        first ratio reported by ``smact.neutral_ratios``. The same pair may
        appear more than once.
    """
    first, second, third = els[0], els[1], els[2]
    elements = [e.symbol for e in els] + ['O']

    # Pauling electronegativities, in the same order as the species
    electronegativities = [first.pauling_eneg, second.pauling_eneg,
                           third.pauling_eneg, 3.44]

    all_compounds = []
    cation_state_sets = product(first.oxidation_states,
                                second.oxidation_states,
                                third.oxidation_states)
    for cation_states in cation_state_sets:
        ox_states = list(cation_states) + [-2]

        # Charge balance test
        cn_e, cn_r = smact.neutral_ratios(ox_states, threshold=8)
        if not cn_e:
            continue

        # Electronegativity test
        if not screening.pauling_test(ox_states, electronegativities):
            continue

        # Only the first neutral ratio is recorded for this combination
        all_compounds.append((elements, cn_r[0]))
    return all_compounds

Multiprocessing is used to speed things up: generating all compositions takes ~40 minutes on a 4 GHz Intel Core i7 iMac.



In [3]:

    
# Screen every element combination in parallel; each call to smact_filter
# returns a list of compounds for one combination.
# NOTE(review): the worker processes must be able to locate smact_filter;
# with the 'spawn' start method, functions defined in a notebook cell may
# not be importable by the workers - confirm on the target platform.
with multiprocessing.Pool() as pool:
    result = pool.map(smact_filter, all_el_combos)

# Flatten the per-combination lists into one list of compounds
flat_list = []
for compounds in result:
    flat_list.extend(compounds)
print("Number of compositions: {0}".format(len(flat_list)))









    



Number of compositions: 3217181

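We turn the generated compositions into pretty (reduced) formulas, again using multiprocessing. Because each entry records only the elements and their ratio, different oxidation-state combinations can give the same formula, so the formulas are de-duplicated. There should be ~1.1M unique formulas.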



In [4]:

    
def comp_maker(comp):
    """Convert an (elements, amounts) pair into a reduced formula string.

    Args:
        comp: Indexable whose item 0 is a sequence of element symbols and
            whose item 1 is the matching sequence of amounts. The two are
            paired with ``zip``, so any surplus entries are ignored.

    Returns:
        The ``reduced_formula`` of the ``Composition`` built from the
        concatenated symbol/amount string.
    """
    symbols, amounts = comp[0], comp[1]
    # Interleave symbol and amount, e.g. ['Li', 'O'], (2, 1) -> 'Li2O1'
    formula_string = ''.join(
        str(symbol) + str(amount) for symbol, amount in zip(symbols, amounts)
    )
    return Composition(formula_string).reduced_formula

# Convert every (elements, ratio) entry to its reduced formula in parallel
with multiprocessing.Pool() as p:
    pretty_formulas = p.map(comp_maker, flat_list)

# De-duplicate, then sort. Iteration order of a set of strings can differ
# from one interpreter run to the next (string hash randomisation), so
# sorting keeps the row order of everything built from this list
# reproducible across runs.
unique_pretty_formulas = sorted(set(pretty_formulas))
print("Number of unique compositions formulas: {0}".format(len(unique_pretty_formulas)))









    



Number of unique compositions formulas: 1118505



In [21]:

    
# One-column table of formulas; duplicates (if any) are dropped
new_data = (
    pd.DataFrame(unique_pretty_formulas)
    .rename(columns={0: 'pretty_formula'})
    .drop_duplicates(subset='pretty_formula')
)
new_data.describe()









    Out[21]:







  
    
      
      pretty_formula
    
  
  
    
      count
      1118505
    
    
      unique
      1118505
    
    
      top
      Sm2VBrO5
    
    
      freq
      1



In [22]:

    
# Add descriptor columns
# this will take a little time as we have over 1 million rows
#
# Rebind new_data to the frame returned by featurize_dataframe instead of
# relying on in-place mutation. On matminer releases where `inplace`
# defaults to False the input frame is left untouched, and the
# 'composition_obj' column needed by the second call would be missing.
# Capturing the return value is correct under either default.
str_to_comp = StrToComposition(target_col_id='composition_obj')
new_data = str_to_comp.featurize_dataframe(new_data, col_id='pretty_formula')

feature_calculators = MultipleFeaturizer([cf.Stoichiometry(),
                                          cf.ElementProperty.from_preset("magpie"),
                                          cf.ValenceOrbital(props=['avg']),
                                          cf.IonProperty(fast=True),
                                          cf.BandCenter(), cf.AtomicOrbitals()])

# Names of the feature columns produced by the combined featurizer
feature_labels = feature_calculators.feature_labels()
new_data = feature_calculators.featurize_dataframe(new_data, col_id='composition_obj')



In [30]:

    
# Write the featurized table to disk as CSV, 10000 rows per write chunk
output_csv = 'All_oxide_comps_dataframe_featurized.csv'
new_data.to_csv(output_csv, chunksize=10000)