In [12]:
import pandas as pd
import numpy as np
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import dendrogram, linkage
#from sklearn.cluster import DBSCAN
#from sklearn import metrics
#from scipy.spatial.distance import pdist
In [14]:
%matplotlib inline
# Print NumPy floats in fixed-point form (no scientific notation), rounded to five decimals.
np.set_printoptions(suppress=True, precision=5)
In [2]:
# Load the card feature table from disk.
# NOTE(review): hard-coded absolute path; consider a configurable data directory.
cards = pd.read_csv('/mnt/data/mmalik/card_features.csv')

# Keep every column from position 13 onwards as a plain NumPy array.
# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` yields the same ndarray on both old and current pandas.
# presumably the first 13 columns are identifiers/metadata rather than features -- TODO confirm
X = cards.iloc[:, 13:].values
In [3]:
# Preview the first five rows of the columns (position 13 onwards) that are used as clustering input.
cards.iloc[:, 13:].head(n=5)
Out[3]:
In [4]:
# Echo the feature array extracted from `cards` to eyeball its values.
X
Out[4]:
In [5]:
from scipy.cluster.hierarchy import dendrogram, linkage
In [6]:
# Cluster only the first 45000 rows of the feature array rather than all of it.
X_sample = X[0:45000]
# Echo the shape to confirm how many rows and columns were kept.
X_sample.shape
Out[6]:
In [13]:
# Build the hierarchical clustering: single linkage over Jaccard distances between rows.
# NOTE(review): Jaccard is meant for boolean vectors; assumes X_sample holds
# binary indicator features -- TODO confirm.
Z = linkage(X_sample, method='single', metric='jaccard')
In [15]:
# Inspect the first row of the linkage result, i.e. the first merge recorded.
# Per the SciPy linkage docs, a row is [cluster index, cluster index, distance, size of new cluster].
Z[0]
Out[15]:
In [16]:
import matplotlib.pyplot as plt
# No `plt.figure()` here: with `%matplotlib inline` a figure created in this cell is
# rendered (empty) and closed as soon as the cell finishes, so it would never be the
# figure that the dendrogram cell draws on.
Out[16]:
In [21]:
import sys
# Raise the interpreter's maximum recursion depth to 50000.
# NOTE(review): presumably needed because dendrogram() recurses over the linkage tree
# and hits the default limit with this many samples -- confirm.
sys.setrecursionlimit(50000)
In [23]:
# Draw on a figure/axes created in this very cell instead of relying on pyplot's
# implicit "current figure", so the cell renders the same way on every re-run.
fig, ax = plt.subplots()
dendrogram(
    Z,
    truncate_mode='lastp',  # condense the tree to its last p merged clusters
    p=12,                   # number of merged clusters kept as leaves
    leaf_rotation=90.,      # rotate the leaf labels by 90 degrees
    leaf_font_size=12.,
    show_contracted=True,   # to get a distribution impression in truncated branches
    ax=ax,                  # plot onto the axes created above
)
ax.set_title('Hierarchical clustering dendrogram (truncated)')
ax.set_xlabel('sample index or (cluster size)')
ax.set_ylabel('distance')
plt.show()
In [1]:
# Report the installed SciPy version for reproducibility.
# NOTE(review): the execution count (In [1]) shows this cell was run out of document
# order; consider moving it next to the other imports at the top.
import scipy as sp
sp.__version__
Out[1]:
In [ ]: