Homework 4:

  1. Follow the steps below to:
    • Read wine.csv in the data folder.
    • The first column contains the wine category. Don't use it in the models below; we are going to treat this as unsupervised learning and compare the results to the Wine column.
  2. Try KMeans with n_clusters = 3 and compare the clusters to the Wine column.
  3. Try PCA and see how much you can reduce the variable space.
    • How many components did you need to explain 99% of the variance in this dataset?
    • Plot the PCA variables to see if they bring out the clusters.
  4. Try KMeans and hierarchical clustering on the PCA data and again compare the clusters to the Wine column.

Dataset

wine.csv is in the data folder under homeworks


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.datasets as datasets

%matplotlib inline

In [2]:
wine = pd.read_csv('../data/wine.csv')

In [3]:
wine.head(5)


Out[3]:
Wine Alcohol Malic.acid Ash Acl Mg Phenols Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD Proline
0 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
2 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
3 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
4 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735

In [4]:
wine['Wine'].unique()


Out[4]:
array([1, 2, 3])

K-Means


In [5]:
X = wine.drop('Wine', axis=1).values
Y = wine['Wine']

In [6]:
X


Out[6]:
array([[  1.42300000e+01,   1.71000000e+00,   2.43000000e+00, ...,
          1.04000000e+00,   3.92000000e+00,   1.06500000e+03],
       [  1.32000000e+01,   1.78000000e+00,   2.14000000e+00, ...,
          1.05000000e+00,   3.40000000e+00,   1.05000000e+03],
       [  1.31600000e+01,   2.36000000e+00,   2.67000000e+00, ...,
          1.03000000e+00,   3.17000000e+00,   1.18500000e+03],
       ..., 
       [  1.32700000e+01,   4.28000000e+00,   2.26000000e+00, ...,
          5.90000000e-01,   1.56000000e+00,   8.35000000e+02],
       [  1.31700000e+01,   2.59000000e+00,   2.37000000e+00, ...,
          6.00000000e-01,   1.62000000e+00,   8.40000000e+02],
       [  1.41300000e+01,   4.10000000e+00,   2.74000000e+00, ...,
          6.10000000e-01,   1.60000000e+00,   5.60000000e+02]])

In [7]:
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()

In [8]:
X_scaled = scale.fit_transform(X)

In [9]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300, random_state=1)
Y_hat_kmeans = kmeans.fit(X_scaled).labels_

In [10]:
mu = kmeans.cluster_centers_
mu


Out[10]:
array([[-0.92607185, -0.39404154, -0.49451676,  0.17060184, -0.49171185,
        -0.07598265,  0.02081257, -0.03353357,  0.0582655 , -0.90191402,
         0.46180361,  0.27076419, -0.75384618],
       [ 0.83523208, -0.30380968,  0.36470604, -0.61019129,  0.5775868 ,
         0.88523736,  0.97781956, -0.56208965,  0.58028658,  0.17106348,
         0.47398365,  0.77924711,  1.12518529],
       [ 0.16490746,  0.87154706,  0.18689833,  0.52436746, -0.07547277,
        -0.97933029, -1.21524764,  0.72606354, -0.77970639,  0.94153874,
        -1.16478865, -1.29241163, -0.40708796]])

In [11]:
plt.scatter(X_scaled[:,0], X_scaled[:,1], c=Y_hat_kmeans);



In [12]:
plt.scatter(X_scaled[:,0], X_scaled[:,1], c=Y);



In [13]:
plt.scatter(X_scaled[:,0], X_scaled[:,1], c=Y_hat_kmeans, alpha=0.4)
plt.scatter(mu[:,0], mu[:,1], s=100, c=np.unique(Y_hat_kmeans))
print(mu)


[[-0.92607185 -0.39404154 -0.49451676  0.17060184 -0.49171185 -0.07598265
   0.02081257 -0.03353357  0.0582655  -0.90191402  0.46180361  0.27076419
  -0.75384618]
 [ 0.83523208 -0.30380968  0.36470604 -0.61019129  0.5775868   0.88523736
   0.97781956 -0.56208965  0.58028658  0.17106348  0.47398365  0.77924711
   1.12518529]
 [ 0.16490746  0.87154706  0.18689833  0.52436746 -0.07547277 -0.97933029
  -1.21524764  0.72606354 -0.77970639  0.94153874 -1.16478865 -1.29241163
  -0.40708796]]
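
A quantitative check of step 2: cross-tabulate the KMeans labels against the true Wine column and score the agreement. This is a minimal sketch assuming the variables defined above; cluster numbers are arbitrary, so look for one dominant cell per row rather than matching labels exactly.

In [ ]:
from sklearn.metrics import adjusted_rand_score

# Contingency table: rows are the true Wine categories, columns the clusters.
print(pd.crosstab(Y, Y_hat_kmeans))

# Adjusted Rand index: 1.0 is perfect agreement, ~0 is chance level.
print(adjusted_rand_score(Y, Y_hat_kmeans))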

PCA


In [14]:
from IPython.core.pylabtools import figsize
%matplotlib inline
figsize(12,5)

In [15]:
_ = plt.scatter(X_scaled[:,0], X_scaled[:,1])



In [16]:
from sklearn.decomposition import PCA
pca = PCA()

In [17]:
X_pca = pca.fit_transform(X_scaled)
print(X_pca[:10, :])


[[ -3.31675081e+00   1.44346263e+00  -1.65739045e-01   2.15631188e-01
    6.93042841e-01   2.23880128e-01   5.96426546e-01  -6.51390947e-02
   -6.41442706e-01   1.02095585e+00   4.51563395e-01   5.40810414e-01
   -6.62386309e-02]
 [ -2.20946492e+00  -3.33392887e-01  -2.02645737e+00   2.91358318e-01
   -2.57654635e-01   9.27120244e-01   5.37756128e-02  -1.02441595e+00
    3.08846753e-01   1.59701372e-01   1.42657306e-01   3.88237741e-01
    3.63650247e-03]
 [ -2.51674015e+00   1.03115130e+00   9.82818670e-01  -7.24902309e-01
   -2.51033118e-01  -5.49276047e-01   4.24205451e-01   3.44216131e-01
    1.17783447e+00   1.13360857e-01   2.86672847e-01   5.83573183e-04
    2.17165104e-02]
 [ -3.75706561e+00   2.75637191e+00  -1.76191842e-01  -5.67983308e-01
   -3.11841591e-01  -1.14431000e-01  -3.83337297e-01  -6.43593498e-01
   -5.25444215e-02   2.39412605e-01  -7.59584312e-01  -2.42019563e-01
   -3.69483531e-01]
 [ -1.00890849e+00   8.69830821e-01   2.02668822e+00   4.09765788e-01
    2.98457503e-01   4.06519601e-01   4.44074463e-01  -4.16700470e-01
   -3.26819165e-01  -7.83664820e-02   5.25945083e-01  -2.16664158e-01
   -7.93635655e-02]
 [ -3.05025392e+00   2.12240111e+00  -6.29395827e-01   5.15637495e-01
   -6.32018734e-01  -1.23430557e-01   4.01653758e-01  -3.94893421e-01
    1.52146076e-01  -1.01995816e-01  -4.05585316e-01  -3.79432684e-01
    1.45155331e-01]
 [ -2.44908967e+00   1.17485013e+00  -9.77094891e-01   6.58305046e-02
   -1.02776191e+00   6.20120743e-01   5.28907285e-02   3.71933862e-01
    4.57015855e-01   1.01656346e+00   4.42433411e-01   1.41229844e-01
   -2.71778184e-01]
 [ -2.05943687e+00   1.60896307e+00   1.46281883e-01   1.19260801e+00
    7.69034938e-02   1.43980622e+00   3.23755923e-02  -2.32978954e-01
   -1.23370316e-01   7.35600047e-01  -2.93554859e-01   3.79663026e-01
   -1.10163787e-01]
 [ -2.51087430e+00   9.18070957e-01  -1.77096903e+00  -5.62703612e-02
   -8.92256977e-01   1.29181048e-01   1.25285071e-01   4.99577904e-01
   -6.06589198e-01   1.74106613e-01   5.08932893e-01  -6.35249336e-01
    1.42083536e-01]
 [ -2.75362819e+00   7.89437674e-01  -9.84247490e-01  -3.49381568e-01
   -4.68553076e-01  -1.63391650e-01  -8.74352245e-01  -1.50579503e-01
   -2.30489152e-01   1.79420103e-01  -1.24781710e-02   5.50326823e-01
   -4.24548533e-02]]

In [18]:
pca.components_


Out[18]:
array([[-0.1443294 ,  0.24518758,  0.00205106,  0.23932041, -0.14199204,
        -0.39466085, -0.4229343 ,  0.2985331 , -0.31342949,  0.0886167 ,
        -0.29671456, -0.37616741, -0.28675223],
       [ 0.48365155,  0.22493093,  0.31606881, -0.0105905 ,  0.299634  ,
         0.06503951, -0.00335981,  0.02877949,  0.03930172,  0.52999567,
        -0.27923515, -0.16449619,  0.36490283],
       [-0.20738262,  0.08901289,  0.6262239 ,  0.61208035,  0.13075693,
         0.14617896,  0.1506819 ,  0.17036816,  0.14945431, -0.13730621,
         0.08522192,  0.16600459, -0.12674592],
       [ 0.0178563 , -0.53689028,  0.21417556, -0.06085941,  0.35179658,
        -0.19806835, -0.15229479,  0.20330102, -0.39905653, -0.06592568,
         0.42777141, -0.18412074,  0.23207086],
       [-0.26566365,  0.03521363, -0.14302547,  0.06610294,  0.72704851,
        -0.14931841, -0.10902584, -0.50070298,  0.13685982, -0.07643678,
        -0.17361452, -0.10116099, -0.1578688 ],
       [ 0.21353865,  0.53681385,  0.15447466, -0.10082451,  0.03814394,
        -0.0841223 , -0.01892002, -0.25859401, -0.53379539, -0.41864414,
         0.10598274,  0.26585107,  0.11972557],
       [-0.05639636,  0.42052391, -0.14917061, -0.28696914,  0.3228833 ,
        -0.02792498, -0.06068521,  0.59544729,  0.37213935, -0.22771214,
         0.23207564, -0.0447637 ,  0.0768045 ],
       [ 0.39613926,  0.06582674, -0.17026002,  0.42797018, -0.15636143,
        -0.40593409, -0.18724536, -0.23328465,  0.36822675, -0.03379692,
         0.43662362, -0.07810789,  0.12002267],
       [-0.50861912,  0.07528304,  0.30769445, -0.20044931, -0.27140257,
        -0.28603452, -0.04957849, -0.19550132,  0.20914487, -0.05621752,
        -0.08582839, -0.1372269 ,  0.57578611],
       [ 0.21160473, -0.30907994, -0.02712539,  0.05279942,  0.06787022,
        -0.32013135, -0.16315051,  0.21553507,  0.1341839 , -0.29077518,
        -0.52239889,  0.52370587,  0.162116  ],
       [ 0.22591696, -0.07648554,  0.49869142, -0.47931378, -0.07128891,
        -0.30434119,  0.02569409, -0.11689586,  0.23736257, -0.0318388 ,
         0.04821201, -0.0464233 , -0.53926983],
       [-0.26628645,  0.12169604, -0.04962237, -0.05574287,  0.06222011,
        -0.30388245, -0.04289883,  0.04235219, -0.09555303,  0.60422163,
         0.259214  ,  0.60095872, -0.07940162],
       [ 0.01496997,  0.02596375, -0.14121803,  0.09168285,  0.05677422,
        -0.46390791,  0.83225706,  0.11403985, -0.11691707, -0.0119928 ,
        -0.08988884, -0.15671813,  0.01444734]])

In [19]:
pca.mean_


Out[19]:
array([  7.84141790e-15,   2.44498554e-16,  -4.05917497e-15,
        -7.11041712e-17,  -2.49488320e-17,  -1.95536471e-16,
         9.44313292e-16,  -4.17892936e-16,  -1.54059038e-15,
        -4.12903170e-16,   1.39838203e-15,   2.12688793e-15,
        -6.98567296e-17])

In [20]:
_ = plt.scatter(X_pca[:,0], X_pca[:,1], c=Y_hat_kmeans)



In [21]:
# print(np.cov(X_pca, rowvar=False))
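
Step 3 also asks whether PCA brings out the clusters. A small sketch for comparison with the KMeans coloring above: the same projection onto the first two components, colored by the true Wine labels.

In [ ]:
# First two principal components, colored by the true Wine labels.
_ = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=Y)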

In [22]:
sum(pca.explained_variance_ratio_)


Out[22]:
0.99999999999999978

In [23]:
pca.explained_variance_ratio_


Out[23]:
array([ 0.36198848,  0.1920749 ,  0.11123631,  0.0706903 ,  0.06563294,
        0.04935823,  0.04238679,  0.02680749,  0.02222153,  0.01930019,
        0.01736836,  0.01298233,  0.00795215])
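
To answer how many components are needed to explain 99% of the variance, accumulate the ratios above; a minimal sketch using the fitted pca object (variable names here are illustrative):

In [ ]:
# Cumulative explained variance; the first index at or above 0.99
# answers the homework question.
cumvar = np.cumsum(pca.explained_variance_ratio_)
print(cumvar)
print('Components for 99% of variance:', np.searchsorted(cumvar, 0.99) + 1)

With the ratios printed above, the cumulative sum first reaches 0.99 at the 12th component.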

In [24]:
_ = pd.plotting.scatter_matrix(wine, diagonal='kde')



In [25]:
he_df = pd.DataFrame(X_pca)
_ = pd.plotting.scatter_matrix(he_df, diagonal='kde')



In [28]:
plt.plot(pca.explained_variance_ratio_);
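
Step 4 asks for KMeans on the PCA output. A hedged sketch: the number of retained components is a choice; here the 12 covering 99% of the variance are kept, and the variable names are illustrative.

In [ ]:
# KMeans on the PCA scores, then the same comparison against Wine.
X_reduced = X_pca[:, :12]
kmeans_pca = KMeans(n_clusters=3, init='random', n_init=10, random_state=1)
Y_hat_pca = kmeans_pca.fit_predict(X_reduced)
print(pd.crosstab(Y, Y_hat_pca))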


Hierarchical clustering


In [29]:
# compute the pairwise distance matrix
from scipy.spatial.distance import pdist, squareform

# the square form is shown here for inspection; linkage below takes the
# condensed pdist output directly
distx = squareform(pdist(X_pca, metric='euclidean'))
distx


Out[29]:
array([[ 0.        ,  3.49753522,  3.02660794, ...,  6.4909413 ,
         6.07878091,  7.18442107],
       [ 3.49753522,  0.        ,  4.1429119 , ...,  6.39689969,
         6.09492714,  7.36771922],
       [ 3.02660794,  4.1429119 ,  0.        , ...,  6.25367723,
         5.85179331,  6.35388503],
       ..., 
       [ 6.4909413 ,  6.39689969,  6.25367723, ...,  0.        ,
         1.82621785,  3.39251526],
       [ 6.07878091,  6.09492714,  5.85179331, ...,  1.82621785,
         0.        ,  3.32427633],
       [ 7.18442107,  7.36771922,  6.35388503, ...,  3.39251526,
         3.32427633,  0.        ]])

In [30]:
# perform clustering and plot the dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram

# linkage expects the condensed distance vector (pdist output) or the raw
# observations; passing the square matrix would cluster its rows instead
Z = linkage(pdist(X_pca, metric='euclidean'), method='ward')
R = dendrogram(Z, color_threshold=10)

plt.xlabel('points')
plt.ylabel('Height')
plt.suptitle('Cluster Dendrogram', fontweight='bold', fontsize=14);
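
Finally, to compare the hierarchical clusters with the Wine column, cut the tree into three flat clusters; a minimal sketch reusing the linkage matrix Z from the cell above (labels_hc is an illustrative name):

In [ ]:
from scipy.cluster.hierarchy import fcluster

# Cut the Ward tree into 3 flat clusters and cross-tabulate against Wine.
labels_hc = fcluster(Z, t=3, criterion='maxclust')
print(pd.crosstab(Y, labels_hc))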



In [ ]:
# sklearn.preprocessing.normalize