In [12]:
import pandas as pd
import numpy as np
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import dendrogram, linkage
#from sklearn.cluster import DBSCAN
#from sklearn import metrics
#from scipy.spatial.distance import pdist

In [14]:
%matplotlib inline
np.set_printoptions(precision=5, suppress=True)

In [2]:
# Load the per-card feature table and extract the museum-visit indicator
# columns as a NumPy array. Columns from position 13 onward are the
# visited_museum_* columns (see the head() preview in the next cell).
CARD_FEATURES_CSV = '/mnt/data/mmalik/card_features.csv'
FIRST_MUSEUM_COL = 13

cards = pd.read_csv(CARD_FEATURES_CSV)
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray and is available on every pandas version.
X = cards.iloc[:, FIRST_MUSEUM_COL:].values

In [3]:
# Preview the first five rows of the columns that feed the clustering
# (everything from column position 13 onward).
cards.iloc[:, 13:].head(n=5)


Out[3]:
visited_museum_1 visited_museum_2 visited_museum_3 visited_museum_4 visited_museum_5 visited_museum_6 visited_museum_7 visited_museum_8 visited_museum_9 visited_museum_10 ... visited_museum_29 visited_museum_30 visited_museum_31 visited_museum_32 visited_museum_33 visited_museum_34 visited_museum_35 visited_museum_36 visited_museum_37 visited_museum_38
0 1 0 1 0 0 0 0 0 0 1 ... 1 0 0 1 0 0 0 0 0 0
1 0 1 1 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 1
2 0 1 1 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 1
3 1 0 1 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 1 1 0 1 0 1 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 38 columns


In [4]:
# Display the feature matrix; NumPy abbreviates large arrays with "...".
X


Out[4]:
array([[1, 0, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 1],
       [0, 1, 1, ..., 0, 0, 1],
       ..., 
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [5]:
from scipy.cluster.hierarchy import dendrogram, linkage

In [6]:
# Restrict the clustering input to the first 45,000 rows of X.
# NOTE(review): presumably capped to keep the pairwise-distance computation
# in linkage() tractable -- confirm.
X_sample = X[0:45000]
# Sanity-check the resulting (rows, columns) shape.
X_sample.shape


Out[6]:
(45000, 38)

In [13]:
# Agglomerative (hierarchical) clustering of the sampled rows using single
# linkage on pairwise Jaccard distances. Z is the resulting linkage matrix.
Z = linkage(
    X_sample,
    method='single',
    metric='jaccard',
)

In [15]:
# Inspect the first merge. Per SciPy's linkage documentation each row of Z is
# [cluster index a, cluster index b, distance between them, size of new cluster].
Z[0]


Out[15]:
array([    0.,  7115.,     0.,     2.])

In [16]:
# pyplot is used by the dendrogram cell further down.
# Do not open a figure here: with the inline backend a figure created in one
# cell is finalized when that cell finishes, so a figure opened in this cell
# is never the one a later cell draws on and only yields an empty Figure repr.
import matplotlib.pyplot as plt


Out[16]:
<matplotlib.figure.Figure at 0x7f93edc75110>
<matplotlib.figure.Figure at 0x7f93edc75110>

In [21]:
import sys

# Raise Python's recursion limit to 50,000 frames.
# NOTE(review): presumably needed because dendrogram() recurses through the
# linkage tree and single-linkage trees can be very deep -- confirm.
sys.setrecursionlimit(50 * 1000)

In [23]:
# Draw a truncated dendrogram of the linkage matrix Z.
# The figure and axes are created in the same cell that draws on them and are
# passed to dendrogram() explicitly, so the plot does not depend on whatever
# pyplot's implicit "current figure" happens to be.
fig, ax = plt.subplots()
dendrogram(
    Z,
    truncate_mode='lastp',  # condense the tree to its last p merged clusters
    p=12,  # number of merged clusters kept as leaves
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,  # to get a distribution impression in truncated branches
    ax=ax,
)
ax.set_title('Hierarchical Clustering Dendrogram (truncated)')
ax.set_xlabel('sample index or (cluster size)')
ax.set_ylabel('distance')
plt.show()



In [1]:
# Report the installed SciPy version.
# NOTE(review): this cell's execution count (In [1]) shows it was run out of
# order relative to the cells above; a fresh Restart & Run All will renumber it.
import scipy as sp
sp.__version__


Out[1]:
'0.18.0'

In [ ]: