In [1]:

    
import pandas as pd
import seaborn as sns
from sklearn.linear_model import RidgeCV
import numpy as np
from sklearn import cluster, metrics



In [2]:

    
from matplotlib import pyplot as plt
%matplotlib inline



In [3]:

    
data_path = '../../data/cleaned/UCB_dept_merge.csv'

Modified data

The first step was to select a subset of useful columns, and do some early data processing to have a reasonable data set.



In [4]:

    
# Parsing date columns allows us to do some nifty things with pandas
date_columns = ['po_closed_date', 'creation_date']



In [5]:

    
data = pd.read_csv(data_path, parse_dates=date_columns)



In [6]:

    
data['cost_total'] = data.unit_price * data.quantity

Time analysis



In [7]:

    
groupby = ['creation_date', 'supplier_name']
# groupby = ['creation_date', 'department_name']
# groupby = ['creation_date', 'manufacturer']

# week_counts = data.groupby(groupby).count() # Number of POs per department
week_counts = data.groupby(groupby).mean() # Mean/med cost per department

week_counts = week_counts['cost_total'].unstack(groupby[-1])\
    .resample('w', how='sum')
week_counts['week'] = week_counts.index.week



In [8]:

    
week_sums = week_counts.groupby('week').sum().replace(np.nan, 0)
week_mean = week_sums.mean(1)
week_ste = week_sums.std(1) / np.sqrt((week_sums != 0).sum(1))



In [9]:

    
ax = week_mean.plot(figsize=(10, 5))
ax.fill_between(week_ste.index, week_mean-week_ste, week_mean+week_ste,
                alpha=.2)
sns.regplot(week_mean.index.values, week_mean, ax=ax)









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x1189608d0>



In [10]:

    
def sort_column(arr):
    """Sort a column by its values."""
    ix_sort = np.argsort(arr.values)
    return arr.iloc[ix_sort[::-1]].values



In [11]:

    
# Similarities in time for depts
null_cut = 2 # Keep depts with less than this # null counts
keep_cols = (week_sums == 0).sum(0) < null_cut
week_sums_active = week_sums.loc[:, keep_cols]
week_sums_normalized = week_sums_active / week_sums_active.sum(0)
week_sums_normalized_ordered = week_sums_normalized.apply(sort_column)



In [29]:

    
ax = week_sums_normalized.plot(legend=False, c='k', alpha=.1, figsize=(30, 10))
ax.figure.patch.set_color('w')
ax.set_ylabel('% of purchases')









    Out[29]:





<matplotlib.text.Text at 0x11b693790>



In [ ]:



In [ ]:

    
order



In [64]:

    
f, ax = plt.subplots(figsize=(15, 5))
plt_weeks = 20
orders = np.arange(plt_weeks)[::-1]
for i, ((col, row), order) in enumerate(zip(week_sums_normalized.iloc[:, :plt_weeks].iteritems(), orders)):
    row.mul(25).add(i).plot(ax=ax, c='w')
    ax.lines[-1].set_zorder(order)
    fill = ax.fill_between(row.index, 0, row.mul(25).add(i), facecolor='k')
    fill.set_zorder(order)
ax.figure.patch.set_color('k')
ax.grid(False)



In [76]:

    
f, ax = plt.subplots(figsize=(10, 10))
ax.imshow(week_sums_normalized.T, cmap=plt.cm.RdBu_r,
          aspect='auto', interpolation='nearest')









    Out[76]:





<matplotlib.image.AxesImage at 0x11ce56a90>



In [77]:

    
clust = cluster.KMeans(n_clusters=10)
# clust = cluster.AffinityPropagation()
# clust = cluster.SpectralClustering()
# clust = cluster.AgglomerativeClustering(n_clusters=20)



In [78]:

    
keep_weeks = 10
# X = week_sums_normalized.T.values
# X = week_sums_active.T.values
X = week_sums_normalized_ordered.T.values[:, :keep_weeks]
try:
    clust.fit(X)
    labels = clust.predict(X)
except:
    labels = clust.fit_predict(X)



In [79]:

    
labels_df = pd.DataFrame(labels, columns=['label']).astype(str)
labels_df['id'] = 1



In [80]:

    
f, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 10))
col_plt = plt.cm.Reds(np.linspace(0, 1, clust.cluster_centers_.shape[0]))
ax.set_color_cycle(col_plt)
_ = ax.plot(clust.cluster_centers_.T, alpha=.4)

_ = labels_df.groupby('label').count().plot(kind='bar', ax=ax2)



In [81]:

    
sil = metrics.silhouette_score(X, labels)
print sil









    



0.342004767446



In [82]:

    
def inspect_clust(clust, ax, col):
    n_week, n_sig = clust.shape
    clust_mean = clust.mean(1)
    clust_ste = clust.std(1) / np.sqrt(n_sig)

    _ = ax.plot(clust, alpha=.5, color=col(.5))
    _ = ax.plot(clust_mean, color='k')
    _ = ax.fill_between(range(n_week), 
                        clust_mean - clust_ste, clust_mean + clust_ste,
                        alpha=.5, color='k')



In [83]:

    
cols = [plt.cm.Reds, plt.cm.Blues, plt.cm.Greens]



In [84]:

    
label_dict = dict(fontsize=18)
plt_clusts = [1, 3, 6]
f, ax = plt.subplots(figsize=(15, 15))
for plt_clust, col in zip(plt_clusts, cols):
    mask_clust = labels == plt_clust
    X_sub = X[mask_clust, :].T
    names_clust = week_sums_normalized.columns[mask_clust]
    inspect_clust(X_sub, ax, col)
#     ax.set_title(names_clust.values)
ax.set_xlabel('Week (sorted from high to low volume weeks)', **label_dict)
ax.set_ylabel('Percent of total annual purchases', **label_dict)
ax.grid(False)
ax.patch.set_visible(False)
plt.tight_layout()



In [163]:

    
f.savefig('../../results/clustering_summary.pdf')

Correlation matrix



In [92]:

    
week_sums_corr = week_sums_normalized.corr()
# week_sums_corr = week_sums_normalized_ordered.corr()



In [93]:

    
clust = sns.clustermap(week_sums_corr, figsize=(15, 15))



In [ ]:



In [ ]:



In [ ]: