In [1]:
    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import string
    
Get data
In [2]:
    
names_df = pd.read_csv("./IMA_mineral_names.txt", sep=',', header=None, names=['names'])
names_df['names'] = names_df['names'].str.strip().str.lower()
    
In [3]:
    
names_df['len'] = names_df['names'].str.len()
    
In [4]:
    
names_df['tuple'] = names_df['names'].apply(lambda x: tuple(sorted(set(x))))
    
In [5]:
    
names_df['setlen'] = names_df['tuple'].apply(lambda x: len(x))
    
In [6]:
    
names_df['set_per_len'] = names_df['setlen']/names_df['len']
    
In [7]:
    
names_df.head(5)
    
    Out[7]:
In [8]:
    
len(names_df)
    
    Out[8]:
In [9]:
    
def sort_and_return_smallest(df):
    if len(df) == 1:
        return df
    df = df.sort_values(by=['len', 'names'])
    return df.iloc[:1, :]
    
In [10]:
    
%time names_set = names_df.groupby(by='tuple', as_index=False).apply(sort_and_return_smallest)
len(names_set)
    
    
    Out[10]:
In [11]:
    
def sort_and_return_smallest_duplicates(df):
    if len(df) == 1:
        return list(df['names'])
    df = df.sort_values(by=['len', 'names'])
    names = df.loc[df['len'] == df['len'].iloc[0], 'names']
    return list(names)
    
In [12]:
    
%time names_duplicates = names_df.groupby(by='tuple', as_index=False).apply(sort_and_return_smallest_duplicates)
len(names_duplicates)
    
    
    Out[12]:
In [13]:
    
# In case some of these are in the chosen set
duplicate_name_dict = {}
for value in names_duplicates:
    if len(value) > 1:
        duplicate_name_dict[value[0]] = value[1:]
    
In [14]:
    
names_set.set_index(['names'], inplace=True)
names_set.head()
    
    Out[14]:
In [15]:
    
letter_df = pd.DataFrame(index=names_set.index, columns=list(string.ascii_lowercase), dtype=bool)
letter_df.loc[:] = False
    
In [16]:
    
%%time
for name, set_ in zip(names_set.index, names_set['tuple']):
    for letter in set_:
        letter_df.loc[name, letter] = True
    
    
In [17]:
    
lowest_count_letter = letter_df.sum(0).argmin()
lowest_count_letter
    
    Out[17]:
In [18]:
    
# Get subset based on the chosen letter
subsetlen = letter_df[letter_df[lowest_count_letter]].sum(1)
name_len = subsetlen.index.str.len()
setlen = pd.DataFrame({'set_per_len' : subsetlen/name_len, 'len' : name_len})
setlen.head()
    
    Out[18]:
In [58]:
    
def get_min_set(df, current_items, m=46, sort_by_len=False, n_search=20):
    # Gather results
    results = []
    # Get letter with lowest number of options
    letter = df.sum(0)
    letter = letter[letter > 0].argmin()
    # Get subset based on the chosen letter
    subsetlen = df.loc[df[letter], :].sum(1)
    name_len = subsetlen.index.str.len()
    
    setlen = pd.DataFrame({'set_per_len' : subsetlen/name_len, 'len' : name_len})
    if sort_by_len:
        order_of_operations = setlen.sort_values(by=['len', 'set_per_len'], ascending=True).index
    else:
        order_of_operations = setlen.sort_values(by=['set_per_len', 'len'], ascending=False).index
    # Loop over the mineral names with chosen letter
    # Ordered based on the (setlen / len)
    for i, (name, letter_bool) in enumerate(df.loc[order_of_operations, :].iterrows()):
        if i > n_search:
            break
        if sum(map(len, current_items))+len(name) >= m:
            continue
        # Get df containing rest of the letters
        df_ = df.copy()
        df_.loc[:, letter_bool] = False
        
        # If letters are exhausted there is one result
        # Check if the result is less than chosen limit m
        if df_.sum(0).sum() == 0 and sum(map(len, current_items))+len(name) < m:
            # This result is "the most optimal" under these names
            current_items_ = current_items + [name]
            len_current_items_ = sum(map(len, current_items_))
            len_unique = len(set("".join(current_items_)))
            results.append((len_current_items_, current_items_))
            if len_current_items_ < 41:
                print("len", len_current_items_, "len_unique", len_unique, current_items_, "place 1", flush=True)
            continue
        
        # Remove mineral names without new letters 
        df_ = df_.loc[df_.sum(1) != 0, :]
        
        if df_.sum(0).sum() == 0:
            if sum(map(len, current_items))+len(name) < m:
                unique_letters = sum(map(len, map(set, current_items + [name])))
                if unique_letters == len(string.ascii_lowercase):
                    # Here is one result (?)
                    current_items_ = current_items + [name]
                    len_current_items_ = sum(map(len, current_items_))
                    len_unique = len(set("".join(current_items_)))
                    results.append((len_current_items_, current_items_))
                    if len_current_items_ < 41:
                        print("len", len_current_items_, "len_unique", len_unique, current_items_, "place 1", flush=True)
            continue
        
        current_items_ = current_items + [name]
        optimal_result = get_min_set(df_, current_items_, m=m, sort_by_len=sort_by_len, n_search=n_search)
        if len(optimal_result):
            results.extend(optimal_result)
    return results
    
In [59]:
    
%%time
res_list = []
order_of_oparations = setlen.loc[letter_df.loc[:, lowest_count_letter], :].sort_values(by=['set_per_len', 'len'], ascending=False).index
for i, (name, letter_bool) in enumerate(letter_df.ix[order_of_oparations].iterrows()):
    print(name, i+1, "/", len(order_of_oparations), flush=True)
    df_ = letter_df.copy()
    df_.loc[:, letter_bool] = False
    res = get_min_set(df_, [name], m=45, sort_by_len=False, n_search=20)
    res_list.extend(res)
    
    
In [60]:
    
res_df = pd.DataFrame([[item[0]] + item[1] for item in res_list]).sort_values(by=0)
    
In [61]:
    
res_df.head()
    
    Out[61]:
In [64]:
    
%%time
res_list_ = []
order_of_oparations = setlen.loc[letter_df.loc[:, lowest_count_letter], :].sort_values(by=['set_per_len', 'len'], ascending=False).index
for i, (name, letter_bool) in enumerate(letter_df.ix[order_of_oparations].iterrows()):
    print(name, i+1, "/", len(order_of_oparations), flush=True)
    df_ = letter_df.copy()
    df_.loc[:, letter_bool] = False
    res_ = get_min_set(df_, [name], m=45, sort_by_len=True, n_search=20)
    res_list_.extend(res_)
    
    
    
In [72]:
    
#res_df_ = pd.DataFrame([[item[0]] + item[1] for item in res_list_]).sort_values(by=0)
    
In [81]:
    
res_df.shape #, res_df_.shape
    
    Out[81]:
In [82]:
    
%time res_df.to_csv("./example_but_not_optimum_no_duplicates.csv")
    
    
In [83]:
    
optimum = res_df[res_df[0] == res_df.iloc[0, 0]]
    
In [84]:
    
optimum.iloc[:, 1:].applymap(lambda x: duplicate_name_dict.get(x, None))
    
    Out[84]:
In [85]:
    
optimum
    
    Out[85]:
In [86]:
    
optimum.apply(lambda x: "".join(sorted(set("".join(x.iloc[1:6].values)))) == string.ascii_lowercase, axis=1)
    
    Out[86]:
In [ ]: