References: https://rstudio-pubs-static.s3.amazonaws.com/33876_1d7794d9a86647ca90c4f182df93f0e8.html and http://nbviewer.jupyter.org/github/OxanaSachenkova/hclust-python/blob/master/hclust.ipynb
In [1]:
import numpy as np
In [2]:
from numpy import genfromtxt

# Remote CSV that is parsed into a structured array.
PROTEIN_CSV_URL = 'http://www.biz.uiowa.edu/faculty/jledolter/DataMining/protein.csv'

data = genfromtxt(
    PROTEIN_CSV_URL,
    delimiter=',',
    names=True,   # field names come from the header row
    dtype=float,  # every column is parsed as float
)
Note: NumPy also provides `recfromcsv()`, and pandas can load the same file with `read_csv()`; the resulting `DataFrame.values` is a NumPy array.
In [3]:
# Number of records in the loaded structured array.
len(data)
Out[3]:
In [4]:
# Number of named fields (columns) in the structured dtype.
len(data.dtype.names)
Out[4]:
In [5]:
# Field names, which genfromtxt took from the CSV header row (names=True).
data.dtype.names
Out[5]:
In [6]:
# Check which container type genfromtxt returned.
type(data)
Out[6]:
In [7]:
# Display the records exactly as loaded.
data
Out[7]:
In [8]:
# Reinterpret the structured record array as a plain 2-D float array with one
# row per record and one column per field.
# `np.float` was only an alias for the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
data_array = data.view((float, len(data.dtype.names)))
In [9]:
# Inspect the plain float array produced by the view.
data_array
Out[9]:
In [10]:
# Swap the axes so that each row holds one field and each column one record.
# NOTE: this rebinds the same name, so running the cell twice flips the
# orientation back again.
data_array = data_array.T
In [11]:
# Show the array after the transpose (one row per field).
print(data_array)
In [12]:
# Rows at positions 1 through 9 of the transposed array, i.e. the fields at
# those positions; row 0 is left out. This is the slice that gets clustered.
data_array[1:10]
Out[12]:
In [13]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
In [14]:
# Condensed pairwise Euclidean distances between rows 1-9 of the transposed array.
data_dist = pdist(data_array[1:10], metric='euclidean')
# Agglomerative clustering on those distances using single linkage.
data_link = linkage(data_dist, method='single')
In [15]:
# The linkage was built from `data_array[1:10]`, i.e. the fields at positions
# 1-9, so the labels must be the same slice of the field names. Passing all of
# `data.dtype.names` pairs leaf i with the name of field i instead of field
# i+1, and SciPy rejects a label list whose length differs from the number of
# leaves.
dendrogram(data_link, labels=data.dtype.names[1:10])
plt.xlabel('Samples')
plt.ylabel('Distance')
# Trailing semicolon suppresses the Text repr in the cell output.
plt.suptitle('Samples clustering', fontweight='bold', fontsize=14);
In [16]:
# Flush any open matplotlib figure to the output.
plt.show()
In [17]:
# Composite figure: a dendrogram on the left, the same tree on top, the
# reordered distance matrix as a heatmap in the middle, a colorbar on the right.
fig = plt.figure(figsize=(8,8))

# Left dendrogram. add_axes takes [left, bottom, width, height] as fractions
# of the figure size.
ax1 = fig.add_axes([0.05,0.1,0.2,0.6])
Y = linkage(data_dist, method='single')
# `data_dist` was computed from `data_array[1:10]` (field positions 1-9), so
# the labels are the matching slice of the field names; the full tuple would
# be off by one and has the wrong length. The target axes is passed explicitly
# rather than relying on whichever axes happens to be current.
Z1 = dendrogram(Y, orientation='right', labels=data.dtype.names[1:10], ax=ax1)
ax1.set_xticks([])  # hide the distance scale; keep the leaf labels on y

# Top dendrogram: same tree, drawn without labels or ticks.
ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
Z2 = dendrogram(Y, ax=ax2)
ax2.set_xticks([])
ax2.set_yticks([])

# Heatmap of the square distance matrix, with rows and columns permuted into
# the leaf order of the two dendrograms so clusters line up with the trees.
axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = squareform(data_dist)
D = D[idx1,:]
D = D[:,idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=plt.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])

# Colorbar in its own narrow axes at the right edge.
axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
plt.colorbar(im, cax=axcolor)
Out[17]:
In [18]:
# Flush any open matplotlib figure to the output.
plt.show()
In [19]:
! pip install fastcluster
In [20]:
from fastcluster import *
%timeit data_link = linkage(data_array[1:10], method='single', metric='euclidean', preserve_input=True)
dendrogram(data_link,labels=data.dtype.names)
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.suptitle('Samples clustering', fontweight='bold', fontsize=14);
plt.show()
In [ ]: