Chapter 10
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
usarrests = '../data/USArrests.csv'
usarrests = pd.read_csv(usarrests, index_col=0)
usarrests.head()
Out[2]:
In [11]:
from sklearn.preprocessing import scale
X = usarrests[['Murder', 'Assault', 'UrbanPop', 'Rape']]
X_scaled = scale(X)
print(X_scaled.mean(axis=0))   # each column now has mean ~0
print(X_scaled.std(axis=0))    # ... and standard deviation 1
df = pd.DataFrame({
    'Murder_scaled': X_scaled[:, 0],
    'Assault_scaled': X_scaled[:, 1],
    'UrbanPop_scaled': X_scaled[:, 2],
    'Rape_scaled': X_scaled[:, 3]
})
df = df.T                      # rows = scaled variables, columns = the 50 states
df.head()
Out[11]:
In [14]:
rij = df.corr()   # correlation between each pair of states (columns of df)
rij = 1 - rij     # correlation-based distance
Out[14]:
In [22]:
x = []
y = []
for i in range(50):
    for j in range(50):
        x.append(rij[i][j])                # 1 - correlation between states i and j
        instance_i = df[i].values          # scaled feature vector of state i
        instance_j = df[j].values          # scaled feature vector of state j
        y.append(np.sqrt(np.sum((instance_i - instance_j) ** 2)))   # Euclidean distance
plt.scatter(x, y)
Out[22]:
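If each observation (state) were standardized to have mean zero and variance one across the four features, 1 - r_ij would be proportional to the squared Euclidean distance between the two observations. Here the variables rather than the observations were scaled, so the relationship is only approximate; a minimal sketch that plots the squared distance (rather than the distance above) to make the relationship easier to judge:
In [ ]:
x2 = []
y2 = []
for i in range(50):
    for j in range(50):
        x2.append(rij[i][j])                 # 1 - correlation between states i and j
        diff = df[i].values - df[j].values
        y2.append(np.sum(diff * diff))       # squared Euclidean distance (no square root)
plt.scatter(x2, y2)
plt.xlabel('1 - r_ij')
plt.ylabel('squared Euclidean distance')
plt.show()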
In [24]:
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
X = usarrests[['Murder','Assault','UrbanPop','Rape']].values
pca.fit(X)
print(pca.explained_variance_ratio_)   # proportion of variance explained by each component
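PCA is sensitive to the scale of the variables; in USArrests, Assault has by far the largest variance, so the unscaled decomposition above is dominated by it. A minimal sketch of the same decomposition after standardizing each variable (reusing `scale` imported earlier):
In [ ]:
pca_scaled = PCA(n_components=4)
pca_scaled.fit(scale(X))                      # standardize each variable before PCA
print(pca_scaled.explained_variance_ratio_)   # proportion of variance explained by each PC
print(pca_scaled.components_)                 # loading vectors, one row per component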
In [27]:
usarrests = '../data/USArrests.csv'
usarrests = pd.read_csv(usarrests, index_col=0)
usarrests.head()
X = usarrests[['Murder','Assault','UrbanPop','Rape']]
y = usarrests.index.values
In [29]:
import scipy.cluster.hierarchy as sch
Y = sch.linkage(X, method='ward')     # Ward linkage on the raw (unscaled) features
_, ax = plt.subplots(figsize=(10, 40))
Z = sch.dendrogram(Y, orientation='right')
idx = Z['leaves']                     # leaf order produced by the dendrogram
ax.set_xticks([])
ax.set_yticklabels(y[idx])            # label each leaf with its state name
ax.set_frame_on(False)
plt.show()
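To turn the dendrogram into hard cluster assignments, the tree can be cut at a chosen number of clusters; a minimal sketch using scipy's `fcluster` on the same Ward linkage:
In [ ]:
from scipy.cluster.hierarchy import fcluster
assignments = fcluster(Y, t=3, criterion='maxclust')   # cut the tree into three clusters
print(pd.Series(assignments, index=y).head(10))        # cluster id for the first few states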
In [32]:
from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=3)   # Ward linkage by default
cluster.fit(X)
labels = cluster.labels_
for label, state in zip(labels, y):
    print(label, ':', state)
In [35]:
from sklearn.preprocessing import scale
X_scale = scale(X)                                # standardize each feature first
cluster = AgglomerativeClustering(n_clusters=3)
cluster.fit(X_scale)
labels = cluster.labels_
for label, state in zip(labels, y):
    print(state, ':', label)
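Standardizing changes the clustering considerably, because Assault no longer dominates the distances. A sketch comparing the unscaled and scaled assignments in a contingency table (recomputing the unscaled labels so both are in scope):
In [ ]:
labels_raw = AgglomerativeClustering(n_clusters=3).fit(X).labels_            # unscaled features
labels_scaled = AgglomerativeClustering(n_clusters=3).fit(X_scale).labels_   # standardized features
print(pd.crosstab(labels_raw, labels_scaled,
                  rownames=['unscaled'], colnames=['scaled']))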
In [55]:
X = np.random.normal(0, 0.1, (60, 50))   # 60 observations, 50 features, small noise
X[0:20, 1] = 1                           # group 1: mean shift in feature 1
X[20:40, 0] = 2                          # group 2: mean shift in features 0 and 1
X[20:40, 1] = 2
X[40:, 0] = 1                            # group 3: mean shift in feature 0
In [56]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)
X_trains = pca.transform(X)              # scores on the first two principal components
plt.scatter(X_trains[:20, 0], X_trains[:20, 1], c='g', marker='+')
plt.scatter(X_trains[20:40, 0], X_trains[20:40, 1], c='b', marker='*')
plt.scatter(X_trains[40:, 0], X_trains[40:, 1], c='r', marker='H')
plt.show()
In [58]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
labels = kmeans.labels_
for idx, label in enumerate(labels):
    print('sample', idx + 1, 'is assigned to cluster', label)
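K-means labels are arbitrary permutations, so comparing them with the true simulated groups is easier in a contingency table. A minimal sketch, assuming the true classes follow the construction above (rows 0-19, 20-39, 40-59):
In [ ]:
true_class = np.repeat([0, 1, 2], 20)    # true group of each simulated observation
print(pd.crosstab(true_class, labels,
                  rownames=['true class'], colnames=['k-means label']))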
In [59]:
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
labels = kmeans.labels_
for idx, label in enumerate(labels):
    print('sample', idx + 1, 'is assigned to cluster', label)
In [60]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)
labels = kmeans.labels_
for idx, label in enumerate(labels):
    print('sample', idx + 1, 'is assigned to cluster', label)
In [62]:
kmeans = KMeans(n_clusters=3)
kmeans.fit(X_trains)                     # cluster on the first two principal component scores
labels = kmeans.labels_
for idx, label in enumerate(labels):
    print('sample', idx + 1, 'is assigned to cluster', label)
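A quick way to see how the clustering on the two principal component scores lines up with the geometry of the data is to colour the PC scatter plot by the fitted labels; a minimal sketch:
In [ ]:
plt.scatter(X_trains[:, 0], X_trains[:, 1], c=labels)   # colour each point by its k-means cluster
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()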
In [64]:
X_scaled = scale(X)
kmeans = KMeans(n_clusters=3)
kmeans.fit(X_scaled)
labels = kmeans.labels_                  # take the labels from this fit, not the previous one
for idx, label in enumerate(labels):
    print('sample', idx + 1, 'is assigned to cluster', label)
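One way to check whether K = 3 is a sensible choice for the scaled data is to look at the total within-cluster sum of squares (`inertia_`) over a range of K; a minimal sketch:
In [ ]:
inertias = []
for k in range(1, 8):
    inertias.append(KMeans(n_clusters=k).fit(X_scaled).inertia_)   # total within-cluster SS
plt.plot(range(1, 8), inertias, marker='o')
plt.xlabel('number of clusters K')
plt.ylabel('total within-cluster sum of squares')
plt.show()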
In [3]:
data_file_path = '../data/Ch10Ex11.csv'
data = pd.read_csv(data_file_path, header=None)
data.head()
Out[3]:
In [4]:
import scipy.cluster.hierarchy as sch
X = data.values
y = np.array(['recover']*1000)
y[20:980]='unknown'
y[980:]='disease'
Y = sch.linkage(X, method='ward')
_, ax = plt.subplots(figsize=(100, 400))
Z = sch.dendrogram(Y, orientation='right')
idx = Z['leaves']
ax.set_xticks([])
ax.set_yticklabels(y[idx])
ax.set_frame_on(False)
plt.show()
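If the Ch10Ex11 data are oriented with genes as rows and the 40 tissue samples as columns (an assumption; the exercise describes 40 samples measured on 1,000 genes, with the first 20 samples healthy and the last 20 diseased), one way to ask which genes differ most across the two groups is to compare mean expression per gene. A minimal sketch under that assumption:
In [ ]:
# Assumes rows of `data` are genes and columns are the 40 tissue samples,
# with columns 0-19 from the healthy group and 20-39 from the diseased group.
healthy_mean = data.iloc[:, :20].mean(axis=1)
disease_mean = data.iloc[:, 20:].mean(axis=1)
mean_diff = (disease_mean - healthy_mean).abs()
print(mean_diff.sort_values(ascending=False).head(10))   # genes with the largest mean difference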
In [ ]: