In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import string

Idea

Get data

  • Calculate the name length
  • Calculate the chr set
  • Calculate the chr set length
  • Calculate the ratio for the chr set length and the name length
  • Remove the duplicate letter sets
  • Create dataframe with index=names, columns=alphabet
  • Calculate the letter distribution
  • Choose argmin(letter sum); The optimum set must have atleast one of these
  • Iterate through all argmin(letter sum) names:
    • Recursion starts here
    • Mark all name letters to False
    • Update the letter distribution
    • Choose argmin(letter sum); The optimum set must have atleast one of these, but due to n cutoff not all combinations are tested.
    • Calculate the effective set length
    • Calculate the effective ratio
    • Choose the n first names with {the highest effective ratio / shortest length}
    • Iterate through the chosen names
    • The recursion ends here

Read data and calculate some properties


In [2]:
names_df = pd.read_csv("./IMA_mineral_names.txt", sep=',', header=None, names=['names'])
names_df['names'] = names_df['names'].str.strip().str.lower()

In [3]:
names_df['len'] = names_df['names'].str.len()

In [4]:
names_df['tuple'] = names_df['names'].apply(lambda x: tuple(sorted(set(x))))

In [5]:
names_df['setlen'] = names_df['tuple'].apply(lambda x: len(x))

In [6]:
names_df['set_per_len'] = names_df['setlen']/names_df['len']

In [7]:
names_df.head(5)


Out[7]:
names len tuple setlen set_per_len
0 abellaite 9 (a, b, e, i, l, t) 6 0.666667
1 abelsonite 10 (a, b, e, i, l, n, o, s, t) 9 0.900000
2 abernathyite 12 (a, b, e, h, i, n, r, t, y) 9 0.750000
3 abhurite 8 (a, b, e, h, i, r, t, u) 8 1.000000
4 abramovite 10 (a, b, e, i, m, o, r, t, v) 9 0.900000

In [8]:
len(names_df)


Out[8]:
3912

Remove duplicates


In [9]:
def sort_and_return_smallest(df):
    if len(df) == 1:
        return df
    df = df.sort_values(by=['len', 'names'])
    return df.iloc[:1, :]

In [10]:
%time names_set = names_df.groupby(by='tuple', as_index=False).apply(sort_and_return_smallest)
len(names_set)


Wall time: 7.92 s
Out[10]:
3187

In [11]:
def sort_and_return_smallest_duplicates(df):
    if len(df) == 1:
        return list(df['names'])
    df = df.sort_values(by=['len', 'names'])
    names = df.loc[df['len'] == df['len'].iloc[0], 'names']
    return list(names)

In [12]:
%time names_duplicates = names_df.groupby(by='tuple', as_index=False).apply(sort_and_return_smallest_duplicates)
len(names_duplicates)


Wall time: 3.03 s
Out[12]:
3187

In [13]:
# In case some of these are in the chosen set
duplicate_name_dict = {}
for value in names_duplicates:
    if len(value) > 1:
        duplicate_name_dict[value[0]] = value[1:]

In [14]:
names_set.set_index(['names'], inplace=True)
names_set.head()


Out[14]:
len tuple setlen set_per_len
names
addibischoffite 15 (a, b, c, d, e, f, h, i, o, s, t) 11 0.733333
molybdofornacite 16 (a, b, c, d, e, f, i, l, m, n, o, r, t, y) 14 0.875000
burckhardtite 13 (a, b, c, d, e, h, i, k, r, t, u) 11 0.846154
buchwaldite 11 (a, b, c, d, e, h, i, l, t, u, w) 11 1.000000
medenbachite 12 (a, b, c, d, e, h, i, m, n, t) 10 0.833333

Create letter table


In [15]:
letter_df = pd.DataFrame(index=names_set.index, columns=list(string.ascii_lowercase), dtype=bool)
letter_df.loc[:] = False

In [16]:
%%time
for name, set_ in zip(names_set.index, names_set['tuple']):
    for letter in set_:
        letter_df.loc[name, letter] = True


Wall time: 11.2 s

Find argmin in the letter distribution


In [17]:
lowest_count_letter = letter_df.sum(0).argmin()
lowest_count_letter


Out[17]:
'q'

In [18]:
# Get subset based on the chosen letter
subsetlen = letter_df[letter_df[lowest_count_letter]].sum(1)
name_len = subsetlen.index.str.len()

setlen = pd.DataFrame({'set_per_len' : subsetlen/name_len, 'len' : name_len})
setlen.head()


Out[18]:
len set_per_len
names
aluminocoquimbite 17 0.705882
barquillite 11 0.818182
jacquesdietrichite 18 0.666667
coquandite 10 1.000000
quetzalcoatlite 15 0.666667

Recursion


In [58]:
def get_min_set(df, current_items, m=46, sort_by_len=False, n_search=20):
    # Gather results
    results = []
    # Get letter with lowest number of options
    letter = df.sum(0)
    letter = letter[letter > 0].argmin()
    # Get subset based on the chosen letter
    subsetlen = df.loc[df[letter], :].sum(1)
    name_len = subsetlen.index.str.len()
    
    setlen = pd.DataFrame({'set_per_len' : subsetlen/name_len, 'len' : name_len})
    if sort_by_len:
        order_of_operations = setlen.sort_values(by=['len', 'set_per_len'], ascending=True).index
    else:
        order_of_operations = setlen.sort_values(by=['set_per_len', 'len'], ascending=False).index
    # Loop over the mineral names with chosen letter
    # Ordered based on the (setlen / len)
    for i, (name, letter_bool) in enumerate(df.loc[order_of_operations, :].iterrows()):
        if i > n_search:
            break
        if sum(map(len, current_items))+len(name) >= m:
            continue
        # Get df containing rest of the letters
        df_ = df.copy()
        df_.loc[:, letter_bool] = False
        
        # If letters are exhausted there is one result
        # Check if the result is less than chosen limit m
        if df_.sum(0).sum() == 0 and sum(map(len, current_items))+len(name) < m:
            # This result is "the most optimal" under these names
            current_items_ = current_items + [name]
            len_current_items_ = sum(map(len, current_items_))
            len_unique = len(set("".join(current_items_)))
            results.append((len_current_items_, current_items_))
            if len_current_items_ < 41:
                print("len", len_current_items_, "len_unique", len_unique, current_items_, "place 1", flush=True)
            continue
        
        # Remove mineral names without new letters 
        df_ = df_.loc[df_.sum(1) != 0, :]
        
        if df_.sum(0).sum() == 0:
            if sum(map(len, current_items))+len(name) < m:
                unique_letters = sum(map(len, map(set, current_items + [name])))
                if unique_letters == len(string.ascii_lowercase):
                    # Here is one result (?)
                    current_items_ = current_items + [name]
                    len_current_items_ = sum(map(len, current_items_))
                    len_unique = len(set("".join(current_items_)))
                    results.append((len_current_items_, current_items_))
                    if len_current_items_ < 41:
                        print("len", len_current_items_, "len_unique", len_unique, current_items_, "place 1", flush=True)
            continue
        
        current_items_ = current_items + [name]
        optimal_result = get_min_set(df_, current_items_, m=m, sort_by_len=sort_by_len, n_search=n_search)
        if len(optimal_result):
            results.extend(optimal_result)
    return results

The effective ratio criteria


In [59]:
%%time
res_list = []
order_of_oparations = setlen.loc[letter_df.loc[:, lowest_count_letter], :].sort_values(by=['set_per_len', 'len'], ascending=False).index

for i, (name, letter_bool) in enumerate(letter_df.ix[order_of_oparations].iterrows()):
    print(name, i+1, "/", len(order_of_oparations), flush=True)
    df_ = letter_df.copy()
    df_.loc[:, letter_bool] = False
    res = get_min_set(df_, [name], m=45, sort_by_len=False, n_search=20)
    res_list.extend(res)


coquandite 1 / 40
qusongite 2 / 40
naquite 3 / 40
quartz 4 / 40
len 40 len_unique 26 ['quartz', 'vanoxite', 'jedwabite', 'fukuchilite', 'gypsum'] place 1
len 40 len_unique 26 ['quartz', 'xifengite', 'jedwabite', 'chkalovite', 'gypsum'] place 1
goldquarryite 5 / 40
quadruphite 6 / 40
coquimbite 7 / 40
qandilite 8 / 40
turquoise 9 / 40
quadridavyne 10 / 40
mapiquiroite 11 / 40
holmquistite 12 / 40
barquillite 13 / 40
macquartite 14 / 40
quijarroite 15 / 40
colquiriite 16 / 40
quadratite 17 / 40
quenselite 18 / 40
penobsquisite 19 / 40
pirquitasite 20 / 40
lindqvistite 21 / 40
nesquehonite 22 / 40
taseqite 23 / 40
qingsongite 24 / 40
rodalquilarite 25 / 40
aluminocoquimbite 26 / 40
manganoquadratite 27 / 40
qatranaite 28 / 40
qingheiite 29 / 40
esquireite 30 / 40
qilianshanite 31 / 40
jacquesdietrichite 32 / 40
quetzalcoatlite 33 / 40
tranquillityite 34 / 40
quenstedtite 35 / 40
queitite 36 / 40
qitianlingite 37 / 40
arsenquatrandorite 38 / 40
strontiojoaquinite 39 / 40
quintinite 40 / 40
Wall time: 2h 12min 33s

In [60]:
res_df = pd.DataFrame([[item[0]] + item[1] for item in res_list]).sort_values(by=0)

In [61]:
res_df.head()


Out[61]:
0 1 2 3 4 5 6 7
140 40 quartz vanoxite jedwabite fukuchilite gypsum None None
231 40 quartz xifengite jedwabite chkalovite gypsum None None
64 41 quartz wilcoxite vajdakite hafnon beryl gypsum None
144 41 quartz vanoxite jedwabite feklichevite gypsum None None
109 41 quartz haxonite jedwabite feklichevite gypsum None None

The shortest name length criteria


In [64]:
%%time
res_list_ = []
order_of_oparations = setlen.loc[letter_df.loc[:, lowest_count_letter], :].sort_values(by=['set_per_len', 'len'], ascending=False).index

for i, (name, letter_bool) in enumerate(letter_df.ix[order_of_oparations].iterrows()):
    print(name, i+1, "/", len(order_of_oparations), flush=True)
    df_ = letter_df.copy()
    df_.loc[:, letter_bool] = False
    res_ = get_min_set(df_, [name], m=45, sort_by_len=True, n_search=20)
    res_list_.extend(res_)


 
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
KeyboardInterrupt: 

In [72]:
#res_df_ = pd.DataFrame([[item[0]] + item[1] for item in res_list_]).sort_values(by=0)

In [81]:
res_df.shape #, res_df_.shape


Out[81]:
(287, 8)

Save the results


In [82]:
%time res_df.to_csv("./example_but_not_optimum_no_duplicates.csv")


Wall time: 15.6 ms

In [83]:
optimum = res_df[res_df[0] == res_df.iloc[0, 0]]

Check for duplicates


In [84]:
optimum.iloc[:, 1:].applymap(lambda x: duplicate_name_dict.get(x, None))


Out[84]:
1 2 3 4 5 6 7
140 None None None None None None None
231 None None None None None None None

In [85]:
optimum


Out[85]:
0 1 2 3 4 5 6 7
140 40 quartz vanoxite jedwabite fukuchilite gypsum None None
231 40 quartz xifengite jedwabite chkalovite gypsum None None

Validate results


In [86]:
optimum.apply(lambda x: "".join(sorted(set("".join(x.iloc[1:6].values)))) == string.ascii_lowercase, axis=1)


Out[86]:
140    True
231    True
dtype: bool

In [ ]: