Homework 4:

  1. Follow the steps below to:
    • Read wine.csv in the data folder.
    • The first column contains the wine category. Don't use it in the models below; we are going to treat this as unsupervised learning and compare the results to the Wine column.
  2. Try KMeans with n_clusters = 3 and compare the clusters to the Wine column.
  3. Try PCA and see how much you can reduce the variable space.
    • How many components did you need to explain 99% of the variance in this dataset?
    • Plot the PCA variables to see if they bring out the clusters.
  4. Try KMeans and hierarchical clustering on the PCA data and again compare the clusters to the Wine column.

Dataset

wine.csv is in the data folder under homeworks


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.datasets as datasets

%matplotlib inline

In [2]:
wine = pd.read_csv('../data/wine.csv')

In [3]:
wine.head(5)


Out[3]:
Wine Alcohol Malic.acid Ash Acl Mg Phenols Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD Proline
0 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
2 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
3 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
4 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735

In [4]:
wine['Wine'].unique()


Out[4]:
array([1, 2, 3])

K-Means


In [5]:
X = wine.drop('Wine', axis=1).values
Y = wine['Wine']

In [6]:
X


Out[6]:
array([[  1.42300000e+01,   1.71000000e+00,   2.43000000e+00, ...,
          1.04000000e+00,   3.92000000e+00,   1.06500000e+03],
       [  1.32000000e+01,   1.78000000e+00,   2.14000000e+00, ...,
          1.05000000e+00,   3.40000000e+00,   1.05000000e+03],
       [  1.31600000e+01,   2.36000000e+00,   2.67000000e+00, ...,
          1.03000000e+00,   3.17000000e+00,   1.18500000e+03],
       ..., 
       [  1.32700000e+01,   4.28000000e+00,   2.26000000e+00, ...,
          5.90000000e-01,   1.56000000e+00,   8.35000000e+02],
       [  1.31700000e+01,   2.59000000e+00,   2.37000000e+00, ...,
          6.00000000e-01,   1.62000000e+00,   8.40000000e+02],
       [  1.41300000e+01,   4.10000000e+00,   2.74000000e+00, ...,
          6.10000000e-01,   1.60000000e+00,   5.60000000e+02]])

In [7]:
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()

In [8]:
X_scaled = scale.fit_transform(X)

In [9]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300, random_state=1)
Y_hat_kmeans = kmeans.fit(X_scaled).labels_

In [10]:
mu = kmeans.cluster_centers_
mu


Out[10]:
array([[-0.92607185, -0.39404154, -0.49451676,  0.17060184, -0.49171185,
        -0.07598265,  0.02081257, -0.03353357,  0.0582655 , -0.90191402,
         0.46180361,  0.27076419, -0.75384618],
       [ 0.83523208, -0.30380968,  0.36470604, -0.61019129,  0.5775868 ,
         0.88523736,  0.97781956, -0.56208965,  0.58028658,  0.17106348,
         0.47398365,  0.77924711,  1.12518529],
       [ 0.16490746,  0.87154706,  0.18689833,  0.52436746, -0.07547277,
        -0.97933029, -1.21524764,  0.72606354, -0.77970639,  0.94153874,
        -1.16478865, -1.29241163, -0.40708796]])

In [11]:
plt.scatter(X_scaled[:,0], X_scaled[:,1], c=Y_hat_kmeans);



In [12]:
plt.scatter(X_scaled[:,0], X_scaled[:,1], c=Y);



In [13]:
plt.scatter(X_scaled[:,0], X_scaled[:,1], c=Y_hat_kmeans, alpha=0.4)
plt.scatter(mu[:,0], mu[:,1], s=100, c=np.unique(Y_hat_kmeans))
print(mu)


[[-0.92607185 -0.39404154 -0.49451676  0.17060184 -0.49171185 -0.07598265
   0.02081257 -0.03353357  0.0582655  -0.90191402  0.46180361  0.27076419
  -0.75384618]
 [ 0.83523208 -0.30380968  0.36470604 -0.61019129  0.5775868   0.88523736
   0.97781956 -0.56208965  0.58028658  0.17106348  0.47398365  0.77924711
   1.12518529]
 [ 0.16490746  0.87154706  0.18689833  0.52436746 -0.07547277 -0.97933029
  -1.21524764  0.72606354 -0.77970639  0.94153874 -1.16478865 -1.29241163
  -0.40708796]]
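
A quantitative check of step 2: cross-tabulate the KMeans labels against the true Wine column and score the agreement. This is a minimal sketch assuming the variables defined above; cluster numbers are arbitrary, so look for one dominant cell per row rather than matching labels exactly.

In [ ]:
from sklearn.metrics import adjusted_rand_score

# Contingency table: rows are the true Wine categories, columns the clusters.
print(pd.crosstab(Y, Y_hat_kmeans))

# Adjusted Rand index: 1.0 is perfect agreement, ~0 is chance level.
print(adjusted_rand_score(Y, Y_hat_kmeans))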

PCA


In [14]:
from IPython.core.pylabtools import figsize
%matplotlib inline
figsize(12,5)

In [15]:
_ = plt.scatter(X_scaled[:,0], X_scaled[:,1])



In [16]:
from sklearn.decomposition import PCA
pca = PCA()

In [17]:
X_pca = pca.fit_transform(X_scaled)
print(X_pca[:10, :])


[[ -3.31675081e+00   1.44346263e+00  -1.65739045e-01   2.15631188e-01
    6.93042841e-01   2.23880128e-01   5.96426546e-01  -6.51390947e-02
   -6.41442706e-01   1.02095585e+00   4.51563395e-01   5.40810414e-01
   -6.62386309e-02]
 [ -2.20946492e+00  -3.33392887e-01  -2.02645737e+00   2.91358318e-01
   -2.57654635e-01   9.27120244e-01   5.37756128e-02  -1.02441595e+00
    3.08846753e-01   1.59701372e-01   1.42657306e-01   3.88237741e-01
    3.63650247e-03]
 [ -2.51674015e+00   1.03115130e+00   9.82818670e-01  -7.24902309e-01
   -2.51033118e-01  -5.49276047e-01   4.24205451e-01   3.44216131e-01
    1.17783447e+00   1.13360857e-01   2.86672847e-01   5.83573183e-04
    2.17165104e-02]
 [ -3.75706561e+00   2.75637191e+00  -1.76191842e-01  -5.67983308e-01
   -3.11841591e-01  -1.14431000e-01  -3.83337297e-01  -6.43593498e-01
   -5.25444215e-02   2.39412605e-01  -7.59584312e-01  -2.42019563e-01
   -3.69483531e-01]
 [ -1.00890849e+00   8.69830821e-01   2.02668822e+00   4.09765788e-01
    2.98457503e-01   4.06519601e-01   4.44074463e-01  -4.16700470e-01
   -3.26819165e-01  -7.83664820e-02   5.25945083e-01  -2.16664158e-01
   -7.93635655e-02]
 [ -3.05025392e+00   2.12240111e+00  -6.29395827e-01   5.15637495e-01
   -6.32018734e-01  -1.23430557e-01   4.01653758e-01  -3.94893421e-01
    1.52146076e-01  -1.01995816e-01  -4.05585316e-01  -3.79432684e-01
    1.45155331e-01]
 [ -2.44908967e+00   1.17485013e+00  -9.77094891e-01   6.58305046e-02
   -1.02776191e+00   6.20120743e-01   5.28907285e-02   3.71933862e-01
    4.57015855e-01   1.01656346e+00   4.42433411e-01   1.41229844e-01
   -2.71778184e-01]
 [ -2.05943687e+00   1.60896307e+00   1.46281883e-01   1.19260801e+00
    7.69034938e-02   1.43980622e+00   3.23755923e-02  -2.32978954e-01
   -1.23370316e-01   7.35600047e-01  -2.93554859e-01   3.79663026e-01
   -1.10163787e-01]
 [ -2.51087430e+00   9.18070957e-01  -1.77096903e+00  -5.62703612e-02
   -8.92256977e-01   1.29181048e-01   1.25285071e-01   4.99577904e-01
   -6.06589198e-01   1.74106613e-01   5.08932893e-01  -6.35249336e-01
    1.42083536e-01]
 [ -2.75362819e+00   7.89437674e-01  -9.84247490e-01  -3.49381568e-01
   -4.68553076e-01  -1.63391650e-01  -8.74352245e-01  -1.50579503e-01
   -2.30489152e-01   1.79420103e-01  -1.24781710e-02   5.50326823e-01
   -4.24548533e-02]]

In [18]:
pca.components_


Out[18]:
array([[-0.1443294 ,  0.24518758,  0.00205106,  0.23932041, -0.14199204,
        -0.39466085, -0.4229343 ,  0.2985331 , -0.31342949,  0.0886167 ,
        -0.29671456, -0.37616741, -0.28675223],
       [ 0.48365155,  0.22493093,  0.31606881, -0.0105905 ,  0.299634  ,
         0.06503951, -0.00335981,  0.02877949,  0.03930172,  0.52999567,
        -0.27923515, -0.16449619,  0.36490283],
       [-0.20738262,  0.08901289,  0.6262239 ,  0.61208035,  0.13075693,
         0.14617896,  0.1506819 ,  0.17036816,  0.14945431, -0.13730621,
         0.08522192,  0.16600459, -0.12674592],
       [ 0.0178563 , -0.53689028,  0.21417556, -0.06085941,  0.35179658,
        -0.19806835, -0.15229479,  0.20330102, -0.39905653, -0.06592568,
         0.42777141, -0.18412074,  0.23207086],
       [-0.26566365,  0.03521363, -0.14302547,  0.06610294,  0.72704851,
        -0.14931841, -0.10902584, -0.50070298,  0.13685982, -0.07643678,
        -0.17361452, -0.10116099, -0.1578688 ],
       [ 0.21353865,  0.53681385,  0.15447466, -0.10082451,  0.03814394,
        -0.0841223 , -0.01892002, -0.25859401, -0.53379539, -0.41864414,
         0.10598274,  0.26585107,  0.11972557],
       [-0.05639636,  0.42052391, -0.14917061, -0.28696914,  0.3228833 ,
        -0.02792498, -0.06068521,  0.59544729,  0.37213935, -0.22771214,
         0.23207564, -0.0447637 ,  0.0768045 ],
       [ 0.39613926,  0.06582674, -0.17026002,  0.42797018, -0.15636143,
        -0.40593409, -0.18724536, -0.23328465,  0.36822675, -0.03379692,
         0.43662362, -0.07810789,  0.12002267],
       [-0.50861912,  0.07528304,  0.30769445, -0.20044931, -0.27140257,
        -0.28603452, -0.04957849, -0.19550132,  0.20914487, -0.05621752,
        -0.08582839, -0.1372269 ,  0.57578611],
       [ 0.21160473, -0.30907994, -0.02712539,  0.05279942,  0.06787022,
        -0.32013135, -0.16315051,  0.21553507,  0.1341839 , -0.29077518,
        -0.52239889,  0.52370587,  0.162116  ],
       [ 0.22591696, -0.07648554,  0.49869142, -0.47931378, -0.07128891,
        -0.30434119,  0.02569409, -0.11689586,  0.23736257, -0.0318388 ,
         0.04821201, -0.0464233 , -0.53926983],
       [-0.26628645,  0.12169604, -0.04962237, -0.05574287,  0.06222011,
        -0.30388245, -0.04289883,  0.04235219, -0.09555303,  0.60422163,
         0.259214  ,  0.60095872, -0.07940162],
       [ 0.01496997,  0.02596375, -0.14121803,  0.09168285,  0.05677422,
        -0.46390791,  0.83225706,  0.11403985, -0.11691707, -0.0119928 ,
        -0.08988884, -0.15671813,  0.01444734]])

In [19]:
pca.mean_


Out[19]:
array([  7.84141790e-15,   2.44498554e-16,  -4.05917497e-15,
        -7.11041712e-17,  -2.49488320e-17,  -1.95536471e-16,
         9.44313292e-16,  -4.17892936e-16,  -1.54059038e-15,
        -4.12903170e-16,   1.39838203e-15,   2.12688793e-15,
        -6.98567296e-17])

In [20]:
_ = plt.scatter(X_pca[:,0], X_pca[:,1], c=Y_hat_kmeans)



In [21]:
# print(np.cov(X_pca, rowvar=False))
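
Step 3 also asks whether PCA brings out the clusters. A small sketch for comparison with the KMeans coloring above: the same projection onto the first two components, colored by the true Wine labels.

In [ ]:
# First two principal components, colored by the true Wine labels.
_ = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=Y)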

In [22]:
sum(pca.explained_variance_ratio_)


Out[22]:
0.99999999999999978

In [23]:
pca.explained_variance_ratio_


Out[23]:
array([ 0.36198848,  0.1920749 ,  0.11123631,  0.0706903 ,  0.06563294,
        0.04935823,  0.04238679,  0.02680749,  0.02222153,  0.01930019,
        0.01736836,  0.01298233,  0.00795215])
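
To answer how many components are needed to explain 99% of the variance, accumulate the ratios above; a minimal sketch using the fitted pca object (variable names here are illustrative):

In [ ]:
# Cumulative explained variance; the first index at or above 0.99
# answers the homework question.
cumvar = np.cumsum(pca.explained_variance_ratio_)
print(cumvar)
print('Components for 99% of variance:', np.searchsorted(cumvar, 0.99) + 1)

With the ratios printed above, the cumulative sum first reaches 0.99 at the 12th component.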

In [24]:
_ = pd.plotting.scatter_matrix(wine, diagonal='kde')



In [25]:
he_df = pd.DataFrame(X_pca)
_ = pd.plotting.scatter_matrix(he_df, diagonal='kde')



In [28]:
plt.plot(pca.explained_variance_ratio_);
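
Step 4 asks for KMeans on the PCA output. A hedged sketch: the number of retained components is a choice; here the 12 covering 99% of the variance are kept, and the variable names are illustrative.

In [ ]:
# KMeans on the PCA scores, then the same comparison against Wine.
X_reduced = X_pca[:, :12]
kmeans_pca = KMeans(n_clusters=3, init='random', n_init=10, random_state=1)
Y_hat_pca = kmeans_pca.fit_predict(X_reduced)
print(pd.crosstab(Y, Y_hat_pca))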


Hierarchical clustering


In [29]:
# compute the pairwise distance matrix
from scipy.spatial.distance import pdist, squareform

# the square form is shown here for inspection; linkage below takes the
# condensed pdist output directly
distx = squareform(pdist(X_pca, metric='euclidean'))
distx


Out[29]:
array([[ 0.        ,  3.49753522,  3.02660794, ...,  6.4909413 ,
         6.07878091,  7.18442107],
       [ 3.49753522,  0.        ,  4.1429119 , ...,  6.39689969,
         6.09492714,  7.36771922],
       [ 3.02660794,  4.1429119 ,  0.        , ...,  6.25367723,
         5.85179331,  6.35388503],
       ..., 
       [ 6.4909413 ,  6.39689969,  6.25367723, ...,  0.        ,
         1.82621785,  3.39251526],
       [ 6.07878091,  6.09492714,  5.85179331, ...,  1.82621785,
         0.        ,  3.32427633],
       [ 7.18442107,  7.36771922,  6.35388503, ...,  3.39251526,
         3.32427633,  0.        ]])

In [30]:
# perform clustering and plot the dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram

# linkage expects the condensed distance vector (pdist output) or the raw
# observations; passing the square matrix would cluster its rows instead
Z = linkage(pdist(X_pca, metric='euclidean'), method='ward')
R = dendrogram(Z, color_threshold=10)

plt.xlabel('points')
plt.ylabel('Height')
plt.suptitle('Cluster Dendrogram', fontweight='bold', fontsize=14);
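
Finally, to compare the hierarchical clusters with the Wine column, cut the tree into three flat clusters; a minimal sketch reusing the linkage matrix Z from the cell above (labels_hc is an illustrative name):

In [ ]:
from scipy.cluster.hierarchy import fcluster

# Cut the Ward tree into 3 flat clusters and cross-tabulate against Wine.
labels_hc = fcluster(Z, t=3, criterion='maxclust')
print(pd.crosstab(Y, labels_hc))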



In [ ]:
# sklearn.preprocessing.normalize