In [116]:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm
 
import pandas as pd
import numpy as np

In [117]:
wine=pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",header=None)

In [118]:
wine.head()


Out[118]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13
0 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
2 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
3 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
4 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735

In [119]:
wine.columns=['winetype','Alcohol','Malic acid','Ash','Alcalinity of ash','Magnesium','Total phenols','Flavanoids','Nonflavanoid phenols','Proanthocyanins','Color intensity','Hue','OD280/OD315 of diluted wines','Proline']

In [120]:
wine.head()


Out[120]:
winetype Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
0 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
2 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
3 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
4 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735

In [121]:
wine.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
winetype                        178 non-null int64
Alcohol                         178 non-null float64
Malic acid                      178 non-null float64
Ash                             178 non-null float64
Alcalinity of ash               178 non-null float64
Magnesium                       178 non-null int64
Total phenols                   178 non-null float64
Flavanoids                      178 non-null float64
Nonflavanoid phenols            178 non-null float64
Proanthocyanins                 178 non-null float64
Color intensity                 178 non-null float64
Hue                             178 non-null float64
OD280/OD315 of diluted wines    178 non-null float64
Proline                         178 non-null int64
dtypes: float64(11), int64(3)
memory usage: 19.5 KB

In [122]:
wine.describe()


Out[122]:
winetype Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
count 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000
mean 1.938202 13.000618 2.336348 2.366517 19.494944 99.741573 2.295112 2.029270 0.361854 1.590899 5.058090 0.957449 2.611685 746.893258
std 0.775035 0.811827 1.117146 0.274344 3.339564 14.282484 0.625851 0.998859 0.124453 0.572359 2.318286 0.228572 0.709990 314.907474
min 1.000000 11.030000 0.740000 1.360000 10.600000 70.000000 0.980000 0.340000 0.130000 0.410000 1.280000 0.480000 1.270000 278.000000
25% 1.000000 12.362500 1.602500 2.210000 17.200000 88.000000 1.742500 1.205000 0.270000 1.250000 3.220000 0.782500 1.937500 500.500000
50% 2.000000 13.050000 1.865000 2.360000 19.500000 98.000000 2.355000 2.135000 0.340000 1.555000 4.690000 0.965000 2.780000 673.500000
75% 3.000000 13.677500 3.082500 2.557500 21.500000 107.000000 2.800000 2.875000 0.437500 1.950000 6.200000 1.120000 3.170000 985.000000
max 3.000000 14.830000 5.800000 3.230000 30.000000 162.000000 3.880000 5.080000 0.660000 3.580000 13.000000 1.710000 4.000000 1680.000000

In [123]:
pd.value_counts(wine['winetype'])


Out[123]:
2    71
1    59
3    48
Name: winetype, dtype: int64

In [124]:
x=wine.iloc[:,1:14]
y=wine.iloc[:,:1]

In [125]:
x.columns


Out[125]:
Index(['Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',
       'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
       'Proanthocyanins', 'Color intensity', 'Hue',
       'OD280/OD315 of diluted wines', 'Proline'],
      dtype='object')

In [126]:
y.columns


Out[126]:
Index(['winetype'], dtype='object')

In [127]:
x.head()


Out[127]:
Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
0 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
2 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
3 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
4 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735

In [128]:
y.head()


Out[128]:
winetype
0 1
1 1
2 1
3 1
4 1

In [129]:
# K Means Cluster
model = KMeans(n_clusters=3)
model.fit(x)


Out[129]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [130]:
model.labels_


Out[130]:
array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1,
       1, 1, 2, 2, 0, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2,
       2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2,
       1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2])

In [131]:
pd.value_counts(y['winetype'])


Out[131]:
2    71
1    59
3    48
Name: winetype, dtype: int64

In [132]:
pd.value_counts(model.labels_)


Out[132]:
2    69
1    62
0    47
dtype: int64

In [171]:
# We convert all the 1s to 0s and 0s to 1s.
predY = np.choose(model.labels_, [1,3,2]).astype(np.int64)

In [172]:
pd.value_counts(y['winetype'])


Out[172]:
2    71
1    59
3    48
Name: winetype, dtype: int64

In [173]:
pd.value_counts(predY)


Out[173]:
2    69
3    62
1    47
dtype: int64

In [174]:
# Performance Metrics
sm.accuracy_score(y, predY)


Out[174]:
0.702247191011236

In [175]:
# Confusion Matrix
sm.confusion_matrix(y, predY)


Out[175]:
array([[46,  0, 13],
       [ 1, 50, 20],
       [ 0, 19, 29]])

In [138]:
!pip install ggplot
#import ggplot as ggplot


Requirement already satisfied: ggplot in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: scipy in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: patsy>=0.4 in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: cycler in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: statsmodels in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: pandas in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: matplotlib in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: brewer2mpl in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: six in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: numpy in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: python-dateutil>=2 in c:\users\kogentix\anaconda3\lib\site-packages (from pandas->ggplot)
Requirement already satisfied: pytz>=2011k in c:\users\kogentix\anaconda3\lib\site-packages (from pandas->ggplot)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=1.5.6 in c:\users\kogentix\anaconda3\lib\site-packages (from matplotlib->ggplot)

In [139]:
from ggplot import *
%matplotlib inline

In [140]:
p = ggplot(aes(x='Alcohol', y='Ash',color="winetype"), data=wine)
p + geom_point()


Out[140]:
<ggplot: (-9223371877061043950)>

In [141]:
p2 = ggplot(aes(x='Alcohol', y='Ash',color="predY"), data=wine)
p2 + geom_point()


Out[141]:
<ggplot: (-9223371877060637994)>

In [142]:
from sklearn import preprocessing

In [143]:
x_scaled = preprocessing.scale(x)

In [144]:
x.head()


Out[144]:
Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
0 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
2 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
3 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
4 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735

In [145]:
x_scaled


Out[145]:
array([[ 1.51861254, -0.5622498 ,  0.23205254, ...,  0.36217728,
         1.84791957,  1.01300893],
       [ 0.24628963, -0.49941338, -0.82799632, ...,  0.40605066,
         1.1134493 ,  0.96524152],
       [ 0.19687903,  0.02123125,  1.10933436, ...,  0.31830389,
         0.78858745,  1.39514818],
       ..., 
       [ 0.33275817,  1.74474449, -0.38935541, ..., -1.61212515,
        -1.48544548,  0.28057537],
       [ 0.20923168,  0.22769377,  0.01273209, ..., -1.56825176,
        -1.40069891,  0.29649784],
       [ 1.39508604,  1.58316512,  1.36520822, ..., -1.52437837,
        -1.42894777, -0.59516041]])

In [146]:
x.columns


Out[146]:
Index(['Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',
       'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
       'Proanthocyanins', 'Color intensity', 'Hue',
       'OD280/OD315 of diluted wines', 'Proline'],
      dtype='object')

In [147]:
x.index


Out[147]:
RangeIndex(start=0, stop=178, step=1)

In [148]:
x_scaleddf=pd.DataFrame(data=x_scaled,index=x.index,columns=x.columns)

In [149]:
# K Means Cluster
model2 = KMeans(n_clusters=3)
model2.fit(x_scaleddf)


Out[149]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [150]:
model2.labels_


Out[150]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [151]:
pd.value_counts(model2.labels_)


Out[151]:
2    65
0    62
1    51
dtype: int64

In [152]:
pd.value_counts(y['winetype'])


Out[152]:
2    71
1    59
3    48
Name: winetype, dtype: int64

In [157]:
# We convert all the 1s to 0s and 0s to 1s.
predY2 = np.choose(model2.labels_, [1, 3, 2]).astype(np.int64)

In [158]:
pd.value_counts(predY2)


Out[158]:
2    65
1    62
3    51
dtype: int64

In [159]:
sm.accuracy_score(y, predY2)


Out[159]:
0.9662921348314607

In [160]:
sm.confusion_matrix(y, predY2)


Out[160]:
array([[59,  0,  0],
       [ 3, 65,  3],
       [ 0,  0, 48]])

In [ ]: