In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import string
Get data
In [2]:
names_df = pd.read_csv("./IMA_mineral_names.txt", sep=',', header=None, names=['names'])
names_df['names'] = names_df['names'].str.strip().str.lower()
In [3]:
names_df['len'] = names_df['names'].str.len()
In [4]:
names_df['tuple'] = names_df['names'].apply(lambda x: tuple(sorted(set(x))))
In [5]:
names_df['setlen'] = names_df['tuple'].apply(lambda x: len(x))
In [6]:
names_df['set_per_len'] = names_df['setlen']/names_df['len']
In [7]:
names_df.head(5)
Out[7]:
In [8]:
len(names_df)
Out[8]:
In [9]:
def sort_and_return_smallest(df):
if len(df) == 1:
return df
df = df.sort_values(by=['len', 'names'])
return df.iloc[:1, :]
In [10]:
%time names_set = names_df.groupby(by='tuple', as_index=False).apply(sort_and_return_smallest)
len(names_set)
Out[10]:
In [11]:
def sort_and_return_smallest_duplicates(df):
if len(df) == 1:
return list(df['names'])
df = df.sort_values(by=['len', 'names'])
names = df.loc[df['len'] == df['len'].iloc[0], 'names']
return list(names)
In [12]:
%time names_duplicates = names_df.groupby(by='tuple', as_index=False).apply(sort_and_return_smallest_duplicates)
len(names_duplicates)
Out[12]:
In [13]:
# In case some of these are in the chosen set
duplicate_name_dict = {}
for value in names_duplicates:
if len(value) > 1:
duplicate_name_dict[value[0]] = value[1:]
In [14]:
names_set.set_index(['names'], inplace=True)
names_set.head()
Out[14]:
In [15]:
letter_df = pd.DataFrame(index=names_set.index, columns=list(string.ascii_lowercase), dtype=bool)
letter_df.loc[:] = False
In [16]:
%%time
for name, set_ in zip(names_set.index, names_set['tuple']):
for letter in set_:
letter_df.loc[name, letter] = True
In [17]:
lowest_count_letter = letter_df.sum(0).argmin()
lowest_count_letter
Out[17]:
In [18]:
# Get subset based on the chosen letter
subsetlen = letter_df[letter_df[lowest_count_letter]].sum(1)
name_len = subsetlen.index.str.len()
setlen = pd.DataFrame({'set_per_len' : subsetlen/name_len, 'len' : name_len})
setlen.head()
Out[18]:
In [58]:
def get_min_set(df, current_items, m=46, sort_by_len=False, n_search=20):
# Gather results
results = []
# Get letter with lowest number of options
letter = df.sum(0)
letter = letter[letter > 0].argmin()
# Get subset based on the chosen letter
subsetlen = df.loc[df[letter], :].sum(1)
name_len = subsetlen.index.str.len()
setlen = pd.DataFrame({'set_per_len' : subsetlen/name_len, 'len' : name_len})
if sort_by_len:
order_of_operations = setlen.sort_values(by=['len', 'set_per_len'], ascending=True).index
else:
order_of_operations = setlen.sort_values(by=['set_per_len', 'len'], ascending=False).index
# Loop over the mineral names with chosen letter
# Ordered based on the (setlen / len)
for i, (name, letter_bool) in enumerate(df.loc[order_of_operations, :].iterrows()):
if i > n_search:
break
if sum(map(len, current_items))+len(name) >= m:
continue
# Get df containing rest of the letters
df_ = df.copy()
df_.loc[:, letter_bool] = False
# If letters are exhausted there is one result
# Check if the result is less than chosen limit m
if df_.sum(0).sum() == 0 and sum(map(len, current_items))+len(name) < m:
# This result is "the most optimal" under these names
current_items_ = current_items + [name]
len_current_items_ = sum(map(len, current_items_))
len_unique = len(set("".join(current_items_)))
results.append((len_current_items_, current_items_))
if len_current_items_ < 41:
print("len", len_current_items_, "len_unique", len_unique, current_items_, "place 1", flush=True)
continue
# Remove mineral names without new letters
df_ = df_.loc[df_.sum(1) != 0, :]
if df_.sum(0).sum() == 0:
if sum(map(len, current_items))+len(name) < m:
unique_letters = sum(map(len, map(set, current_items + [name])))
if unique_letters == len(string.ascii_lowercase):
# Here is one result (?)
current_items_ = current_items + [name]
len_current_items_ = sum(map(len, current_items_))
len_unique = len(set("".join(current_items_)))
results.append((len_current_items_, current_items_))
if len_current_items_ < 41:
print("len", len_current_items_, "len_unique", len_unique, current_items_, "place 1", flush=True)
continue
current_items_ = current_items + [name]
optimal_result = get_min_set(df_, current_items_, m=m, sort_by_len=sort_by_len, n_search=n_search)
if len(optimal_result):
results.extend(optimal_result)
return results
In [59]:
%%time
res_list = []
order_of_oparations = setlen.loc[letter_df.loc[:, lowest_count_letter], :].sort_values(by=['set_per_len', 'len'], ascending=False).index
for i, (name, letter_bool) in enumerate(letter_df.ix[order_of_oparations].iterrows()):
print(name, i+1, "/", len(order_of_oparations), flush=True)
df_ = letter_df.copy()
df_.loc[:, letter_bool] = False
res = get_min_set(df_, [name], m=45, sort_by_len=False, n_search=20)
res_list.extend(res)
In [60]:
res_df = pd.DataFrame([[item[0]] + item[1] for item in res_list]).sort_values(by=0)
In [61]:
res_df.head()
Out[61]:
In [64]:
%%time
res_list_ = []
order_of_oparations = setlen.loc[letter_df.loc[:, lowest_count_letter], :].sort_values(by=['set_per_len', 'len'], ascending=False).index
for i, (name, letter_bool) in enumerate(letter_df.ix[order_of_oparations].iterrows()):
print(name, i+1, "/", len(order_of_oparations), flush=True)
df_ = letter_df.copy()
df_.loc[:, letter_bool] = False
res_ = get_min_set(df_, [name], m=45, sort_by_len=True, n_search=20)
res_list_.extend(res_)
In [72]:
#res_df_ = pd.DataFrame([[item[0]] + item[1] for item in res_list_]).sort_values(by=0)
In [81]:
res_df.shape #, res_df_.shape
Out[81]:
In [82]:
%time res_df.to_csv("./example_but_not_optimum_no_duplicates.csv")
In [83]:
optimum = res_df[res_df[0] == res_df.iloc[0, 0]]
In [84]:
optimum.iloc[:, 1:].applymap(lambda x: duplicate_name_dict.get(x, None))
Out[84]:
In [85]:
optimum
Out[85]:
In [86]:
optimum.apply(lambda x: "".join(sorted(set("".join(x.iloc[1:6].values)))) == string.ascii_lowercase, axis=1)
Out[86]:
In [ ]: