In [1]:
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
%matplotlib inline
# Custom modules
import const
import func
In [2]:
# Load look-up table
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head()
Out[2]:
In [3]:
lut.station_V2.unique()
Out[3]:
In [14]:
lut.groupby('station_V2').station_V2.first().reset_index(drop=True).values.shape
Out[14]:
In [16]:
# Load unique paths and convert them to a dataframe
paths = pd.read_csv(os.path.join(const.DATA_PATH, 'eda_product_flow_unique_paths_station.csv'), index_col=0)
# Convert the 0/1 strings to one column per character (insert a comma after every digit, then split)
paths = (paths.u_str.str.replace('0', '0,').str.replace('1', '1,').str.split(',')).apply(pd.Series)
# Drop last column (empty residue left after the final comma)
paths.drop(paths.columns[-1], axis=1, inplace=True)
# To ints
paths = paths.astype(int)
# Replace column names with the station_V2 numbers from the look-up table
paths.columns = lut.groupby('station_V2').station_V2.first().reset_index(drop=True).values
paths.head(3)
Out[16]:
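The replace-then-split trick above works, but the character split can also be done more directly by expanding each string into a list of its digits. A minimal alternative sketch (illustrative only; `raw_paths` and `alt_paths` are hypothetical names, re-reading the same CSV):

raw_paths = pd.read_csv(os.path.join(const.DATA_PATH, 'eda_product_flow_unique_paths_station.csv'), index_col=0)
# Expand every 0/1 character of u_str into its own integer column
alt_paths = raw_paths.u_str.apply(lambda s: pd.Series([int(c) for c in s]))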
In [17]:
paths.shape
Out[17]:
In [18]:
# Load unique path per sample ID
string_id_to_pid = pd.read_csv(os.path.join(const.DATA_PATH, 'eda_product_flow_sample_paths_station.csv'))
In [19]:
string_id_to_pid.head(3)
Out[19]:
In [20]:
# Load response
y = func.read_last_column('train_numeric.csv')  # [:500000]
y.head(3)
Out[20]:
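`func.read_last_column` is a helper from the custom `func` module, so its exact behaviour isn't shown here. A minimal sketch of what such a helper might do, assuming the file lives in const.DATA_PATH and has an Id column (hypothetical `read_last_column_sketch`, not the actual implementation):

def read_last_column_sketch(filename, data_path=const.DATA_PATH):
    # Peek at the header to find the last column name, then read only Id plus that column
    header = pd.read_csv(os.path.join(data_path, filename), nrows=0)
    last = header.columns[-1]
    return pd.read_csv(os.path.join(data_path, filename), usecols=['Id', last], index_col='Id')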
In [28]:
def cluster_unique_paths(unique_paths, n_clus):
    ''' Cluster the set of unique paths using KMeans '''
    km = KMeans(n_clus, n_init=100, max_iter=100, random_state=11111)
    km.fit(unique_paths)
    # Return the cluster label per unique path as a DataFrame
    clusters = pd.DataFrame(pd.Series(km.labels_),
                            index=unique_paths.index,
                            columns=['cluster'])
    return clusters


def visualize_cluster_result(unique_paths, clusters, sample_to_path, y):
    ''' Plot station usage, error rate and sample count per cluster '''
    nclus = clusters['cluster'].nunique()
    # Map every sample to its path columns and cluster label
    # ('id' is expected to be a column of sample_to_path and becomes the index)
    cluster_per_sample = sample_to_path.merge(unique_paths,
                                              left_on='u_arr_ix',
                                              right_index=True,
                                              how='left') \
                                       .merge(clusters,
                                              left_on='u_arr_ix',
                                              right_index=True,
                                              how='left') \
                                       .drop(['u_arr_ix'], axis=1) \
                                       .set_index('id')
    cluster_grouped = cluster_per_sample.groupby('cluster').mean()
    # Fix weird rounding bug in the station column labels
    cluster_grouped.columns = [round(n, 3) for n in cluster_grouped.columns]
    # Get sorting index based on the major stations
    try:
        so = (cluster_grouped > 0.85).sort_values([0.0, 12.0, 24.1, 24.2, 24.3, 25.1,
                                                   25.202, 25.212, 25.222, 26.0, 27.0])
    except Exception:
        print('Error sorting values')
        so = (cluster_grouped > 0.85)
    # Visualize major stations per cluster
    plt.figure(figsize=(16, 6))
    gs = gridspec.GridSpec(1, 3, width_ratios=[5, 1, 1])
    ax2 = plt.subplot(gs[0])
    ax1 = plt.subplot(gs[1])
    ax3 = plt.subplot(gs[2])
    # Keep only bottom ticks/labels on the bar plots and only y labels on the heatmap
    ax1.tick_params(axis='both', which='both', bottom='on', top='off',
                    labelbottom='on', labelleft='off')
    ax2.tick_params(axis='both', which='both', bottom='off', top='off',
                    labelbottom='off', labelleft='on')
    ax3.tick_params(axis='both', which='both', bottom='on', top='off',
                    labelbottom='on', labelleft='off')
    sns.heatmap(cluster_per_sample.groupby('cluster').mean().loc[so.index, :],
                cmap='Blues', ax=ax2, cbar=False)
    ax2.set_title('Number of clusters: {}'.format(nclus))
    cluster_per_sample['R'] = y
    # Summarize error rate and sample count per cluster
    cluster_summary = pd.DataFrame({'meanR': cluster_per_sample.groupby('cluster').R.mean(),
                                    'sample_count': cluster_per_sample.groupby('cluster').R.count()},
                                   index=range(nclus))
    cluster_summary.sort_values('sample_count', ascending=False, inplace=True)
    # Plot error rates (reversed index so the bars line up with the heatmap rows)
    (cluster_summary['meanR'] * 100).loc[so.index[::-1]].plot(kind='barh', ax=ax1)
    labs = ax1.get_xticks().tolist()
    ax1.set_xticklabels(labs, rotation='vertical')
    ax1.set_xlabel('Error rate (%)')
    ax1.set_xlim([0, ax1.get_xlim()[1]])
    ax1.set_ylabel('')
    # Plot sample count (log scale)
    cluster_summary['sample_count'].loc[so.index[::-1]].apply(lambda x: np.log10(x)).plot(kind='barh', ax=ax3)
    ax3.set_xlabel('log10(Sample count)')
    ax3.set_ylabel('')
    plt.tight_layout()
    return cluster_summary
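The number of clusters is chosen by eye below (8 up to 500). As a rough, hedged way to compare candidate values of k beforehand, one can look at the KMeans inertia or a silhouette score on the unique paths; a minimal sketch, assuming `paths` fits comfortably in memory:

from sklearn.metrics import silhouette_score

for k in [8, 15, 25, 50]:
    km = KMeans(k, n_init=10, max_iter=100, random_state=11111).fit(paths)
    # Lower inertia / higher silhouette suggests a better-separated clustering
    print(k, km.inertia_, silhouette_score(paths, km.labels_, sample_size=5000, random_state=0))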
In [29]:
clusters_8 = cluster_unique_paths(paths, 8)
summary_8 = visualize_cluster_result(paths, clusters_8, string_id_to_pid, y)
In [31]:
clusters_15 = cluster_unique_paths(paths, 15)
summary_15 = visualize_cluster_result(paths, clusters_15, string_id_to_pid, y)
In [30]:
clusters_25 = cluster_unique_paths(paths, 25)
summary_25 = visualize_cluster_result(paths, clusters_25, string_id_to_pid, y)
In [32]:
clusters_50 = cluster_unique_paths(paths, 50)
summary_50 = visualize_cluster_result(paths, clusters_50, string_id_to_pid, y)
In [33]:
clusters_100 = cluster_unique_paths(paths, 100)
summary_100 = visualize_cluster_result(paths, clusters_100, string_id_to_pid, y)
In [34]:
clusters_150 = cluster_unique_paths(paths, 150)
summary_150 = visualize_cluster_result(paths, clusters_150, string_id_to_pid, y)
In [35]:
clusters_500 = cluster_unique_paths(paths, 500)
summary_500 = visualize_cluster_result(paths, clusters_500, string_id_to_pid, y)
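As a rough way to compare how well the different granularities separate failure rates, one can look at the sample-weighted spread of `meanR` across clusters in each summary. A small sketch using the summaries computed above (NaN rows from empty clusters are skipped by the pandas sums):

for name, summ in [('k=8', summary_8), ('k=15', summary_15), ('k=25', summary_25),
                   ('k=50', summary_50), ('k=100', summary_100), ('k=500', summary_500)]:
    w = summ['sample_count'] / summ['sample_count'].sum()
    overall = (summ['meanR'] * w).sum()
    spread = np.sqrt((w * (summ['meanR'] - overall) ** 2).sum())
    print(name, 'overall error rate: %.4f' % overall, 'weighted std: %.4f' % spread)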
In [43]:
pid_to_cluster = string_id_to_pid.merge(clusters_8,
                                        left_on='u_arr_ix',
                                        right_index=True,
                                        how='left')
pid_to_cluster = pid_to_cluster.merge(clusters_15,
                                      left_on='u_arr_ix',
                                      right_index=True,
                                      how='left')
pid_to_cluster = pid_to_cluster.merge(clusters_25,
                                      left_on='u_arr_ix',
                                      right_index=True,
                                      how='left')
pid_to_cluster = pid_to_cluster.merge(clusters_50,
                                      left_on='u_arr_ix',
                                      right_index=True,
                                      how='left')
pid_to_cluster = pid_to_cluster.merge(clusters_150,
                                      left_on='u_arr_ix',
                                      right_index=True,
                                      how='left')
pid_to_cluster = pid_to_cluster.merge(clusters_500,
                                      left_on='u_arr_ix',
                                      right_index=True,
                                      how='left')
pid_to_cluster.set_index('id', inplace=True)
pid_to_cluster.columns = ['unique_path', 'cluster_n8', 'cluster_n15', 'cluster_n25',
                          'cluster_n50', 'cluster_n150', 'cluster_n500']
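The repeated merges above could also be written as a loop over the clusterings, which keeps the cell short if more values of k are added later. A sketch equivalent in spirit to the cell above, assuming string_id_to_pid has the 'id' and 'u_arr_ix' columns used throughout (`pid_to_cluster_alt` is a hypothetical name):

clusterings = {'cluster_n8': clusters_8, 'cluster_n15': clusters_15, 'cluster_n25': clusters_25,
               'cluster_n50': clusters_50, 'cluster_n150': clusters_150, 'cluster_n500': clusters_500}
pid_to_cluster_alt = string_id_to_pid.copy()
for name, clus in clusterings.items():
    # Each clusters_* frame has a single 'cluster' column indexed by unique-path id
    pid_to_cluster_alt = pid_to_cluster_alt.merge(clus.rename(columns={'cluster': name}),
                                                  left_on='u_arr_ix', right_index=True, how='left')
pid_to_cluster_alt = pid_to_cluster_alt.set_index('id').rename(columns={'u_arr_ix': 'unique_path'})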
In [44]:
pid_to_cluster.head(3)
Out[44]:
In [45]:
pid_to_cluster.to_csv(os.path.join(const.DATA_PATH, 'eda_sample_clusters.csv'), index_label='Id')
In [46]:
summary_500.to_csv(os.path.join(const.DATA_PATH, 'eda_summary_cluster_500.csv'), index_label='cluster')
In [480]:
paths.columns.get_loc(26.)
Out[480]:
In [490]:
# Cluster only the tail of the paths (columns from station 26.0 onward; position 103 found in the previous cell)
clusters_line_3 = cluster_unique_paths(paths.iloc[:, 103:], 100)
summary_line_3 = visualize_cluster_result(paths.iloc[:, 103:], clusters_line_3, string_id_to_pid, y)
In [150]:
pid_to_cluster = pd.read_csv(os.path.join(const.DATA_PATH, 'eda_sample_clusters.csv'), index_col=0)
pid_to_cluster['R'] = y
# Keep only samples that have a response value
pid_to_cluster = pid_to_cluster[~pid_to_cluster.R.isnull()]
pid_to_cluster.head(3)
Out[150]:
In [174]:
for col in pid_to_cluster.columns[:-1]:
cnt_agg = pid_to_cluster.groupby(col).agg({col:'count','R':'mean'})
cnt_agg[col] = np.log10(cnt_agg[col])
cnt_agg.set_index(col, drop=True, inplace=True)
cnt_agg.sort_index(inplace=True)
plt.figure(figsize=(16,4))
cnt_agg.plot(kind='bar', figsize=(16,4))
In [168]:
cnt_agg.plot(kind='bar')
Out[168]:
In [49]:
# Clusters of the 500-cluster solution with an error rate above 0.8%
clust_high = summary_500[summary_500.meanR > 0.008]
print(clust_high.shape)
clust_high.head(3)
Out[49]:
In [50]:
# Map every sample to its station path columns and its 500-cluster label
cluster_per_sample_500 = string_id_to_pid.merge(paths,
                                                left_on='u_arr_ix',
                                                right_index=True,
                                                how='left') \
                                         .merge(clusters_500,
                                                left_on='u_arr_ix',
                                                right_index=True,
                                                how='left') \
                                         .drop(['u_arr_ix', 'id'], axis=1)
In [65]:
# Example subset: samples whose path includes both station column 0 and station column 12
exa = cluster_per_sample_500[(cluster_per_sample_500.iloc[:, 12] == 1) & (cluster_per_sample_500.iloc[:, 0] == 1)]
In [69]:
# Split the example subset by whether its cluster is one of the high-error clusters
exa_high = exa[exa.cluster.isin(clust_high.index)]
exa_low = exa[~exa.cluster.isin(clust_high.index)]
In [72]:
exa_high.head(3)
Out[72]:
In [82]:
exa_high.shape
Out[82]:
In [117]:
exa_low.shape
Out[117]:
In [138]:
n_s = 0
n_e = 12
diffs = []
for j in range(150):
mean_high = float((exa_high.iloc[:,(n_s+j):(n_e+j)]>0).sum().sum())/218
mean_low = float((exa_low.iloc[:,(n_s+j):(n_e+j)]>0).sum().sum())/1317
diffs.append(np.divide(mean_high,mean_low))
#print mean_low
#print mean_high
#print np.divide(mean_high,mean_low)
#print ('')
print(max(diffs))
print(min(diffs))
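The same sliding-window comparison can be done in a vectorized way with a rolling sum over per-column visit rates. A minimal sketch (it skips the trailing `cluster` column, so the numbers can differ slightly from the loop above):

win = 12
high_rate = (exa_high.iloc[:, :-1] > 0).sum(axis=0) / float(len(exa_high))
low_rate = (exa_low.iloc[:, :-1] > 0).sum(axis=0) / float(len(exa_low))
# Ratio of window sums between the high- and low-error subsets
ratio = (high_rate.rolling(win).sum() / low_rate.rolling(win).sum()).dropna()
print(ratio.max(), ratio.min())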
In [83]:
exa_low.shape
Out[83]:
In [76]:
exa_high.mean().iloc[:-1].plot(kind='bar', figsize=(16,4))
Out[76]:
In [78]:
(exa_high.mean().iloc[:-1] / exa_low.mean().iloc[:-1]).plot(kind='bar', figsize=(16,4))
Out[78]:
In [56]:
mean_per_cluster = cluster_per_sample_500.groupby('cluster').mean()
In [57]:
mean_per_cluster.head(3)
Out[57]:
In [ ]:
mean_per_cluster
In [61]:
(mean_per_cluster>0).sum(1).hist(bins=50)
Out[61]:
In [28]:
cluster_per_sample_100.columns
Out[28]:
In [53]:
cluster_per_sample_500.iloc[:, :-1].mean().plot(kind='bar', figsize=(13,4))
Out[53]:
In [54]:
cluster_per_sample_500[cluster_per_sample_500.cluster.isin(clust_high.index)].iloc[:, :-1].mean().plot(kind='bar', figsize=(13,4))
Out[54]:
In [55]:
(cluster_per_sample_500[cluster_per_sample_500.cluster.isin(clust_high.index)].iloc[:, :-1].mean() / cluster_per_sample_500.iloc[:, :-1].mean()).plot(kind='bar', figsize=(13,4))
Out[55]:
In [108]:
# Repeat the clustering on a narrower slice of station columns only
clusters_line_3 = cluster_unique_paths(paths.iloc[:, 107:112], 10)
summary_line_3 = visualize_cluster_result(paths.iloc[:, 107:112], clusters_line_3, string_id_to_pid, y)