In [1]:

    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import string

Idea

Get data

Calculate the name length
Calculate the chr set
Calculate the chr set length
Calculate the ratio for the chr set length and the name length
Remove the duplicate letter sets
Create dataframe with index=names, columns=alphabet
Calculate the letter distribution
Choose argmin(letter sum); The optimum set must have atleast one of these
Iterate through all argmin(letter sum) names:
- Recursion starts here
- Mark all name letters to False
- Update the letter distribution
- Choose argmin(letter sum); The optimum set must have atleast one of these, but due to n cutoff not all combinations are tested.
- Calculate the effective set length
- Calculate the effective ratio
- Choose the n first names with {the highest effective ratio / shortest length}
- Iterate through the chosen names
- The recursion ends here

Read data and calculate some properties



In [2]:

    
names_df = pd.read_csv("./IMA_mineral_names.txt", sep=',', header=None, names=['names'])
names_df['names'] = names_df['names'].str.strip().str.lower()



In [3]:

    
names_df['len'] = names_df['names'].str.len()



In [4]:

    
names_df['tuple'] = names_df['names'].apply(lambda x: tuple(sorted(set(x))))



In [5]:

    
names_df['setlen'] = names_df['tuple'].apply(lambda x: len(x))



In [6]:

    
names_df['set_per_len'] = names_df['setlen']/names_df['len']



In [7]:

    
names_df.head(5)









    Out[7]:






  
    
      
      names
      len
      tuple
      setlen
      set_per_len
    
  
  
    
      0
      abellaite
      9
      (a, b, e, i, l, t)
      6
      0.666667
    
    
      1
      abelsonite
      10
      (a, b, e, i, l, n, o, s, t)
      9
      0.900000
    
    
      2
      abernathyite
      12
      (a, b, e, h, i, n, r, t, y)
      9
      0.750000
    
    
      3
      abhurite
      8
      (a, b, e, h, i, r, t, u)
      8
      1.000000
    
    
      4
      abramovite
      10
      (a, b, e, i, m, o, r, t, v)
      9
      0.900000



In [8]:

    
len(names_df)









    Out[8]:





3912

Remove duplicates



In [9]:

    
def sort_and_return_smallest(df):
    if len(df) == 1:
        return df
    df = df.sort_values(by=['len', 'names'])
    return df.iloc[:1, :]



In [10]:

    
%time names_set = names_df.groupby(by='tuple', as_index=False).apply(sort_and_return_smallest)
len(names_set)









    



Wall time: 7.92 s






    Out[10]:





3187



In [11]:

    
def sort_and_return_smallest_duplicates(df):
    if len(df) == 1:
        return list(df['names'])
    df = df.sort_values(by=['len', 'names'])
    names = df.loc[df['len'] == df['len'].iloc[0], 'names']
    return list(names)



In [12]:

    
%time names_duplicates = names_df.groupby(by='tuple', as_index=False).apply(sort_and_return_smallest_duplicates)
len(names_duplicates)









    



Wall time: 3.03 s






    Out[12]:





3187



In [13]:

    
# In case some of these are in the chosen set
duplicate_name_dict = {}
for value in names_duplicates:
    if len(value) > 1:
        duplicate_name_dict[value[0]] = value[1:]



In [14]:

    
names_set.set_index(['names'], inplace=True)
names_set.head()









    Out[14]:






  
    
      
      len
      tuple
      setlen
      set_per_len
    
    
      names
      
      
      
      
    
  
  
    
      addibischoffite
      15
      (a, b, c, d, e, f, h, i, o, s, t)
      11
      0.733333
    
    
      molybdofornacite
      16
      (a, b, c, d, e, f, i, l, m, n, o, r, t, y)
      14
      0.875000
    
    
      burckhardtite
      13
      (a, b, c, d, e, h, i, k, r, t, u)
      11
      0.846154
    
    
      buchwaldite
      11
      (a, b, c, d, e, h, i, l, t, u, w)
      11
      1.000000
    
    
      medenbachite
      12
      (a, b, c, d, e, h, i, m, n, t)
      10
      0.833333

Create letter table



In [15]:

    
letter_df = pd.DataFrame(index=names_set.index, columns=list(string.ascii_lowercase), dtype=bool)
letter_df.loc[:] = False



In [16]:

    
%%time
for name, set_ in zip(names_set.index, names_set['tuple']):
    for letter in set_:
        letter_df.loc[name, letter] = True









    



Wall time: 11.2 s

Find argmin in the letter distribution



In [17]:

    
lowest_count_letter = letter_df.sum(0).argmin()
lowest_count_letter









    Out[17]:





'q'



In [18]:

    
# Get subset based on the chosen letter
subsetlen = letter_df[letter_df[lowest_count_letter]].sum(1)
name_len = subsetlen.index.str.len()

setlen = pd.DataFrame({'set_per_len' : subsetlen/name_len, 'len' : name_len})
setlen.head()









    Out[18]:






  
    
      
      len
      set_per_len
    
    
      names
      
      
    
  
  
    
      aluminocoquimbite
      17
      0.705882
    
    
      barquillite
      11
      0.818182
    
    
      jacquesdietrichite
      18
      0.666667
    
    
      coquandite
      10
      1.000000
    
    
      quetzalcoatlite
      15
      0.666667

Recursion



In [58]:

    
def get_min_set(df, current_items, m=46, sort_by_len=False, n_search=20):
    # Gather results
    results = []
    # Get letter with lowest number of options
    letter = df.sum(0)
    letter = letter[letter > 0].argmin()
    # Get subset based on the chosen letter
    subsetlen = df.loc[df[letter], :].sum(1)
    name_len = subsetlen.index.str.len()
    
    setlen = pd.DataFrame({'set_per_len' : subsetlen/name_len, 'len' : name_len})
    if sort_by_len:
        order_of_operations = setlen.sort_values(by=['len', 'set_per_len'], ascending=True).index
    else:
        order_of_operations = setlen.sort_values(by=['set_per_len', 'len'], ascending=False).index
    # Loop over the mineral names with chosen letter
    # Ordered based on the (setlen / len)
    for i, (name, letter_bool) in enumerate(df.loc[order_of_operations, :].iterrows()):
        if i > n_search:
            break
        if sum(map(len, current_items))+len(name) >= m:
            continue
        # Get df containing rest of the letters
        df_ = df.copy()
        df_.loc[:, letter_bool] = False
        
        # If letters are exhausted there is one result
        # Check if the result is less than chosen limit m
        if df_.sum(0).sum() == 0 and sum(map(len, current_items))+len(name) < m:
            # This result is "the most optimal" under these names
            current_items_ = current_items + [name]
            len_current_items_ = sum(map(len, current_items_))
            len_unique = len(set("".join(current_items_)))
            results.append((len_current_items_, current_items_))
            if len_current_items_ < 41:
                print("len", len_current_items_, "len_unique", len_unique, current_items_, "place 1", flush=True)
            continue
        
        # Remove mineral names without new letters 
        df_ = df_.loc[df_.sum(1) != 0, :]
        
        if df_.sum(0).sum() == 0:
            if sum(map(len, current_items))+len(name) < m:
                unique_letters = sum(map(len, map(set, current_items + [name])))
                if unique_letters == len(string.ascii_lowercase):
                    # Here is one result (?)
                    current_items_ = current_items + [name]
                    len_current_items_ = sum(map(len, current_items_))
                    len_unique = len(set("".join(current_items_)))
                    results.append((len_current_items_, current_items_))
                    if len_current_items_ < 41:
                        print("len", len_current_items_, "len_unique", len_unique, current_items_, "place 1", flush=True)
            continue
        
        current_items_ = current_items + [name]
        optimal_result = get_min_set(df_, current_items_, m=m, sort_by_len=sort_by_len, n_search=n_search)
        if len(optimal_result):
            results.extend(optimal_result)
    return results

The effective ratio criteria



In [59]:

    
%%time
res_list = []
order_of_oparations = setlen.loc[letter_df.loc[:, lowest_count_letter], :].sort_values(by=['set_per_len', 'len'], ascending=False).index

for i, (name, letter_bool) in enumerate(letter_df.ix[order_of_oparations].iterrows()):
    print(name, i+1, "/", len(order_of_oparations), flush=True)
    df_ = letter_df.copy()
    df_.loc[:, letter_bool] = False
    res = get_min_set(df_, [name], m=45, sort_by_len=False, n_search=20)
    res_list.extend(res)









    



coquandite 1 / 40
qusongite 2 / 40
naquite 3 / 40
quartz 4 / 40
len 40 len_unique 26 ['quartz', 'vanoxite', 'jedwabite', 'fukuchilite', 'gypsum'] place 1
len 40 len_unique 26 ['quartz', 'xifengite', 'jedwabite', 'chkalovite', 'gypsum'] place 1
goldquarryite 5 / 40
quadruphite 6 / 40
coquimbite 7 / 40
qandilite 8 / 40
turquoise 9 / 40
quadridavyne 10 / 40
mapiquiroite 11 / 40
holmquistite 12 / 40
barquillite 13 / 40
macquartite 14 / 40
quijarroite 15 / 40
colquiriite 16 / 40
quadratite 17 / 40
quenselite 18 / 40
penobsquisite 19 / 40
pirquitasite 20 / 40
lindqvistite 21 / 40
nesquehonite 22 / 40
taseqite 23 / 40
qingsongite 24 / 40
rodalquilarite 25 / 40
aluminocoquimbite 26 / 40
manganoquadratite 27 / 40
qatranaite 28 / 40
qingheiite 29 / 40
esquireite 30 / 40
qilianshanite 31 / 40
jacquesdietrichite 32 / 40
quetzalcoatlite 33 / 40
tranquillityite 34 / 40
quenstedtite 35 / 40
queitite 36 / 40
qitianlingite 37 / 40
arsenquatrandorite 38 / 40
strontiojoaquinite 39 / 40
quintinite 40 / 40
Wall time: 2h 12min 33s



In [60]:

    
res_df = pd.DataFrame([[item[0]] + item[1] for item in res_list]).sort_values(by=0)



In [61]:

    
res_df.head()









    Out[61]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
    
  
  
    
      140
      40
      quartz
      vanoxite
      jedwabite
      fukuchilite
      gypsum
      None
      None
    
    
      231
      40
      quartz
      xifengite
      jedwabite
      chkalovite
      gypsum
      None
      None
    
    
      64
      41
      quartz
      wilcoxite
      vajdakite
      hafnon
      beryl
      gypsum
      None
    
    
      144
      41
      quartz
      vanoxite
      jedwabite
      feklichevite
      gypsum
      None
      None
    
    
      109
      41
      quartz
      haxonite
      jedwabite
      feklichevite
      gypsum
      None
      None

The shortest name length criteria



In [64]:

    
%%time
res_list_ = []
order_of_oparations = setlen.loc[letter_df.loc[:, lowest_count_letter], :].sort_values(by=['set_per_len', 'len'], ascending=False).index

for i, (name, letter_bool) in enumerate(letter_df.ix[order_of_oparations].iterrows()):
    print(name, i+1, "/", len(order_of_oparations), flush=True)
    df_ = letter_df.copy()
    df_.loc[:, letter_bool] = False
    res_ = get_min_set(df_, [name], m=45, sort_by_len=True, n_search=20)
    res_list_.extend(res_)









    



 





    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
KeyboardInterrupt:



In [72]:

    
#res_df_ = pd.DataFrame([[item[0]] + item[1] for item in res_list_]).sort_values(by=0)



In [81]:

    
res_df.shape #, res_df_.shape









    Out[81]:





(287, 8)

Save the results



In [82]:

    
%time res_df.to_csv("./example_but_not_optimum_no_duplicates.csv")









    



Wall time: 15.6 ms



In [83]:

    
optimum = res_df[res_df[0] == res_df.iloc[0, 0]]

Check for duplicates



In [84]:

    
optimum.iloc[:, 1:].applymap(lambda x: duplicate_name_dict.get(x, None))









    Out[84]:






  
    
      
      1
      2
      3
      4
      5
      6
      7
    
  
  
    
      140
      None
      None
      None
      None
      None
      None
      None
    
    
      231
      None
      None
      None
      None
      None
      None
      None



In [85]:

    
optimum









    Out[85]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
    
  
  
    
      140
      40
      quartz
      vanoxite
      jedwabite
      fukuchilite
      gypsum
      None
      None
    
    
      231
      40
      quartz
      xifengite
      jedwabite
      chkalovite
      gypsum
      None
      None

Validate results



In [86]:

    
optimum.apply(lambda x: "".join(sorted(set("".join(x.iloc[1:6].values)))) == string.ascii_lowercase, axis=1)









    Out[86]:





140    True
231    True
dtype: bool



In [ ]:

	names	len	tuple	setlen	set_per_len
0	abellaite	9	(a, b, e, i, l, t)	6	0.666667
1	abelsonite	10	(a, b, e, i, l, n, o, s, t)	9	0.900000
2	abernathyite	12	(a, b, e, h, i, n, r, t, y)	9	0.750000
3	abhurite	8	(a, b, e, h, i, r, t, u)	8	1.000000
4	abramovite	10	(a, b, e, i, m, o, r, t, v)	9	0.900000

	len	tuple	setlen	set_per_len
names
addibischoffite	15	(a, b, c, d, e, f, h, i, o, s, t)	11	0.733333
molybdofornacite	16	(a, b, c, d, e, f, i, l, m, n, o, r, t, y)	14	0.875000
burckhardtite	13	(a, b, c, d, e, h, i, k, r, t, u)	11	0.846154
buchwaldite	11	(a, b, c, d, e, h, i, l, t, u, w)	11	1.000000
medenbachite	12	(a, b, c, d, e, h, i, m, n, t)	10	0.833333

	len	set_per_len
names
aluminocoquimbite	17	0.705882
barquillite	11	0.818182
jacquesdietrichite	18	0.666667
coquandite	10	1.000000
quetzalcoatlite	15	0.666667

	0	1	2	3	4	5	6	7
140	40	quartz	vanoxite	jedwabite	fukuchilite	gypsum	None	None
231	40	quartz	xifengite	jedwabite	chkalovite	gypsum	None	None
64	41	quartz	wilcoxite	vajdakite	hafnon	beryl	gypsum	None
144	41	quartz	vanoxite	jedwabite	feklichevite	gypsum	None	None
109	41	quartz	haxonite	jedwabite	feklichevite	gypsum	None	None