In [1]:
import numpy as np
import pandas as pd
In [2]:
# Load the mineral names, one per line, stripped of surrounding whitespace and
# lower-cased so that letter comparisons are case-insensitive.
# Blank (or whitespace-only) lines are skipped: an empty name would yield an
# empty letter-set key, and the later division by len(key) would then raise
# ZeroDivisionError.
with open("./IMA_mineral_names.txt", 'r') as f:
    stripped_lines = (line.strip() for line in f)
    names = [entry.lower() for entry in stripped_lines if entry]
In [3]:
# One row per mineral name; further per-name features are added as columns.
df = pd.DataFrame(names, columns=['name'])
In [4]:
# Per-name character features:
#   set        - the distinct characters of the name, as a Python set
#   letter_set - the same characters sorted and joined into a string, which
#                gives a hashable, canonical key for "uses exactly these characters"
#   letter_len - total number of characters in the name
#   set_len    - number of distinct characters in the name
df['set'] = df['name'].map(set)
df['letter_set'] = df['name'].map(lambda name: "".join(sorted(set(name))))
df['letter_len'] = df['name'].map(len)
df['set_len'] = df['set'].map(len)
In [5]:
# Preview the first rows to eyeball the derived columns.
df.head()
Out[5]:
In [6]:
%matplotlib inline
ax = pd.DataFrame.from_dict({num : (np.array(list(map(len, df['letter_set'].unique()))) <= num).sum() for num in range(3,16)}, orient='index').plot.bar(width=0.9)
ax.axhline(50, color='k', label='cutoff')
ax.legend()
ax.set_yscale('log')
{num : (np.array(list(map(len, df['letter_set'].unique()))) <= num).sum() for num in range(3,16)}
Out[6]:
In [7]:
%%time
combinations = {}
n = len(df['letter_set'].unique())
# skip combinations if set_len > 4 (40 first items) or combination set is over 15
len_and_letters = sorted(zip(list(map(len, df['letter_set'].unique())), df['letter_set'].unique()), reverse=False)
for i, (len_let, letters) in enumerate(len_and_letters, 1):
if len_let > 4 and i > 40:
if letters not in combinations:
combinations[letters] = letters_set
continue
print("\r{} / {} --> {}".format(i, n, len(combinations)), end='')
letters_set = set(letters)
for comb, comb_set in combinations.copy().items():
union = letters_set.union(comb_set)
if len(union) > 15:
continue
union_str = "".join(sorted(union))
if union_str not in combinations:
combinations[union_str] = union
if letters not in combinations:
combinations[letters] = letters_set
print()
len(combinations)
In [8]:
%%time
subset_groups = {key : [] for key in combinations.keys()}
n = len(df['letter_set'])
for i, group in enumerate(np.sort(df['letter_set']),1):
print("\r{} / {} --> {}".format(i, n, len(subset_groups)), end='')
mineral_names = list(df.loc[df['letter_set'] == group, 'name'].values)
group_set = set(group)
# check if minerals can be put inside different subgroups
for key, let_set in combinations.items():
if let_set.issuperset(group_set):
subset_groups[key].extend(mineral_names)
print()
subset_groups = {key : sorted(set(values)) for key, values in subset_groups.items()}
In [9]:
# Per-combination statistics, keyed by the combination string and inserted in
# reverse-sorted key order:
#   subset_len                 - number of characters in the combination
#   subset_groups_len          - number of mineral names the combination covers
#   subset_groups_relative_len - covered names per character of the combination
ordered_keys = sorted(subset_groups, reverse=True)
subset_len = {key: len(key) for key in ordered_keys}
subset_groups_len = {key: len(subset_groups[key]) for key in ordered_keys}
subset_groups_relative_len = {
    key: len(subset_groups[key]) / len(key) for key in ordered_keys
}
In [10]:
# Assemble the per-combination dictionaries into one table. The combination
# string starts out as the index and is moved into a regular 'name' column.
subset_columns = {
    'subset_groups': subset_groups,
    'len': subset_len,
    'set_len': subset_groups_len,
    'relative_len': subset_groups_relative_len,
}
df_subsets = (
    pd.DataFrame(subset_columns)
    .reset_index(drop=False)
    .rename(columns={'index': 'name'})
)
In [11]:
# Show the 50 combinations with the highest relative_len (covered names per character).
df_subsets.sort_values(by='relative_len', ascending=False).head(50)
Out[11]:
In [12]:
def _largest_group_summary(x):
    """Summarise the combination covering the most names within one size group.

    `x` is the slice of `df_subsets` for a single combination length.
    Returns a Series with the size of the largest name group, the combination
    string, the list of covered names, and names-per-character.
    """
    # idxmax() returns the index *label* of the first maximum, which is what
    # label-based .loc needs. argmax() returns an integer position, which
    # does not match the labels of a groupby sub-frame.
    best_label = x['set_len'].idxmax()
    best_name = x.loc[best_label, 'name']
    largest = x['set_len'].max()
    return pd.Series({
        'group_len': largest,
        'group': best_name,
        'list_group': x.loc[best_label, 'subset_groups'],
        'relative_len': largest / len(best_name),
    })


# One row per combination length: the single best combination for that length.
max_group_for_size = df_subsets.groupby(by='len').apply(_largest_group_summary).reset_index(drop=False)
# Label of the form "<combination> - <length>", used for the plot axis.
max_group_for_size['group_and_len'] = max_group_for_size.apply(lambda x: "{} - {}".format(x['group'], x['len']), axis=1)
max_group_for_size
Out[12]:
In [13]:
# Map each combination length to the name list of its best combination, and
# print the combination plus its names whenever that list has fewer than 100
# entries.
max_group_len = {}
for _, record in max_group_for_size.iterrows():
    max_group_len[record['len']] = record['list_group']
    if record['group_len'] < 100:
        # Blank line, then the combination, then its names - one per line.
        print("", record['group'], record['list_group'], sep="\n")
In [14]:
def _all_largest_groups(x):
    """Return every combination in `x` that ties for the largest name group.

    `x` is the slice of `df_subsets` for a single combination length.
    """
    top = x.loc[x['set_len'] == x['set_len'].max()]
    return pd.DataFrame({
        'group_len': top['set_len'],
        'group': top['name'],
        'list_group': top['subset_groups'],
        'relative_len': top['set_len'] / top['name'].apply(len),
    })


# Like the single-best table, but keeping all tied combinations per length.
# The original row index surfaces as 'level_1' after reset_index and is dropped.
max_group_for_size_and_friends = (
    df_subsets
    .groupby(by='len')
    .apply(_all_largest_groups)
    .reset_index(drop=False)
    .drop(columns=['level_1'])
)
# Label of the form "<combination> - <length>", used for the plot axis.
max_group_for_size_and_friends['group_and_len'] = [
    "{} - {}".format(row['group'], row['len'])
    for _, row in max_group_for_size_and_friends.iterrows()
]
max_group_for_size_and_friends
Out[14]:
In [15]:
# Plotting setup for the figures below. The inline magic was already issued
# in an earlier cell; it is repeated here alongside the pyplot import.
%matplotlib inline
import matplotlib.pyplot as plt
# Use the 'ggplot' style sheet.
plt.style.use('ggplot')
In [16]:
# Two side-by-side bar charts for the best combination of each length:
# left - number of covered names, right - covered names per character.
# Both use a logarithmic y axis.
fig, panels = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
panel_specs = [
    ('group_len', {}),
    ('relative_len', {'color': 'royalblue'}),
]
for ax, (column, extra_kwargs) in zip(panels, panel_specs):
    ax = max_group_for_size.plot.bar(x='group_and_len', y=column, width=0.9, ax=ax, **extra_kwargs)
    ax.set_yscale('log')
In [17]:
# Same two-panel figure as for the single-best table, but including every
# combination that ties for the largest group at each length.
# Both panels use a logarithmic y axis.
fig, panels = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
panel_specs = [
    ('group_len', {}),
    ('relative_len', {'color': 'royalblue'}),
]
for ax, (column, extra_kwargs) in zip(panels, panel_specs):
    ax = max_group_for_size_and_friends.plot.bar(x='group_and_len', y=column, width=0.9, ax=ax, **extra_kwargs)
    ax.set_yscale('log')