notebook.community

Edit and run



In [116]:

    
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm
 
import pandas as pd
import numpy as np



In [117]:

    
wine=pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",header=None)



In [118]:

    
wine.head()



In [119]:

    
wine.columns=['winetype','Alcohol','Malic acid','Ash','Alcalinity of ash','Magnesium','Total phenols','Flavanoids','Nonflavanoid phenols','Proanthocyanins','Color intensity','Hue','OD280/OD315 of diluted wines','Proline']



In [120]:

    
wine.head()









    Out[120]:







  
    
      
      winetype
      Alcohol
      Malic acid
      Ash
      Alcalinity of ash
      Magnesium
      Total phenols
      Flavanoids
      Nonflavanoid phenols
      Proanthocyanins
      Color intensity
      Hue
      OD280/OD315 of diluted wines
      Proline
    
  
  
    
      0
      1
      14.23
      1.71
      2.43
      15.6
      127
      2.80
      3.06
      0.28
      2.29
      5.64
      1.04
      3.92
      1065
    
    
      1
      1
      13.20
      1.78
      2.14
      11.2
      100
      2.65
      2.76
      0.26
      1.28
      4.38
      1.05
      3.40
      1050
    
    
      2
      1
      13.16
      2.36
      2.67
      18.6
      101
      2.80
      3.24
      0.30
      2.81
      5.68
      1.03
      3.17
      1185
    
    
      3
      1
      14.37
      1.95
      2.50
      16.8
      113
      3.85
      3.49
      0.24
      2.18
      7.80
      0.86
      3.45
      1480
    
    
      4
      1
      13.24
      2.59
      2.87
      21.0
      118
      2.80
      2.69
      0.39
      1.82
      4.32
      1.04
      2.93
      735



In [121]:

    
wine.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
winetype                        178 non-null int64
Alcohol                         178 non-null float64
Malic acid                      178 non-null float64
Ash                             178 non-null float64
Alcalinity of ash               178 non-null float64
Magnesium                       178 non-null int64
Total phenols                   178 non-null float64
Flavanoids                      178 non-null float64
Nonflavanoid phenols            178 non-null float64
Proanthocyanins                 178 non-null float64
Color intensity                 178 non-null float64
Hue                             178 non-null float64
OD280/OD315 of diluted wines    178 non-null float64
Proline                         178 non-null int64
dtypes: float64(11), int64(3)
memory usage: 19.5 KB



In [122]:

    
wine.describe()









    Out[122]:







  
    
      
      winetype
      Alcohol
      Malic acid
      Ash
      Alcalinity of ash
      Magnesium
      Total phenols
      Flavanoids
      Nonflavanoid phenols
      Proanthocyanins
      Color intensity
      Hue
      OD280/OD315 of diluted wines
      Proline
    
  
  
    
      count
      178.000000
      178.000000
      178.000000
      178.000000
      178.000000
      178.000000
      178.000000
      178.000000
      178.000000
      178.000000
      178.000000
      178.000000
      178.000000
      178.000000
    
    
      mean
      1.938202
      13.000618
      2.336348
      2.366517
      19.494944
      99.741573
      2.295112
      2.029270
      0.361854
      1.590899
      5.058090
      0.957449
      2.611685
      746.893258
    
    
      std
      0.775035
      0.811827
      1.117146
      0.274344
      3.339564
      14.282484
      0.625851
      0.998859
      0.124453
      0.572359
      2.318286
      0.228572
      0.709990
      314.907474
    
    
      min
      1.000000
      11.030000
      0.740000
      1.360000
      10.600000
      70.000000
      0.980000
      0.340000
      0.130000
      0.410000
      1.280000
      0.480000
      1.270000
      278.000000
    
    
      25%
      1.000000
      12.362500
      1.602500
      2.210000
      17.200000
      88.000000
      1.742500
      1.205000
      0.270000
      1.250000
      3.220000
      0.782500
      1.937500
      500.500000
    
    
      50%
      2.000000
      13.050000
      1.865000
      2.360000
      19.500000
      98.000000
      2.355000
      2.135000
      0.340000
      1.555000
      4.690000
      0.965000
      2.780000
      673.500000
    
    
      75%
      3.000000
      13.677500
      3.082500
      2.557500
      21.500000
      107.000000
      2.800000
      2.875000
      0.437500
      1.950000
      6.200000
      1.120000
      3.170000
      985.000000
    
    
      max
      3.000000
      14.830000
      5.800000
      3.230000
      30.000000
      162.000000
      3.880000
      5.080000
      0.660000
      3.580000
      13.000000
      1.710000
      4.000000
      1680.000000



In [123]:

    
pd.value_counts(wine['winetype'])









    Out[123]:





2    71
1    59
3    48
Name: winetype, dtype: int64



In [124]:

    
x=wine.iloc[:,1:14]
y=wine.iloc[:,:1]



In [125]:

    
x.columns









    Out[125]:





Index(['Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',
       'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
       'Proanthocyanins', 'Color intensity', 'Hue',
       'OD280/OD315 of diluted wines', 'Proline'],
      dtype='object')



In [126]:

    
y.columns









    Out[126]:





Index(['winetype'], dtype='object')



In [127]:

    
x.head()









    Out[127]:







  
    
      
      Alcohol
      Malic acid
      Ash
      Alcalinity of ash
      Magnesium
      Total phenols
      Flavanoids
      Nonflavanoid phenols
      Proanthocyanins
      Color intensity
      Hue
      OD280/OD315 of diluted wines
      Proline
    
  
  
    
      0
      14.23
      1.71
      2.43
      15.6
      127
      2.80
      3.06
      0.28
      2.29
      5.64
      1.04
      3.92
      1065
    
    
      1
      13.20
      1.78
      2.14
      11.2
      100
      2.65
      2.76
      0.26
      1.28
      4.38
      1.05
      3.40
      1050
    
    
      2
      13.16
      2.36
      2.67
      18.6
      101
      2.80
      3.24
      0.30
      2.81
      5.68
      1.03
      3.17
      1185
    
    
      3
      14.37
      1.95
      2.50
      16.8
      113
      3.85
      3.49
      0.24
      2.18
      7.80
      0.86
      3.45
      1480
    
    
      4
      13.24
      2.59
      2.87
      21.0
      118
      2.80
      2.69
      0.39
      1.82
      4.32
      1.04
      2.93
      735



In [128]:

    
y.head()



In [129]:

    
# K Means Cluster
model = KMeans(n_clusters=3)
model.fit(x)









    Out[129]:





KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)



In [130]:

    
model.labels_









    Out[130]:





array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1,
       1, 1, 2, 2, 0, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2,
       2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2,
       1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2])



In [131]:

    
pd.value_counts(y['winetype'])









    Out[131]:





2    71
1    59
3    48
Name: winetype, dtype: int64



In [132]:

    
pd.value_counts(model.labels_)









    Out[132]:





2    69
1    62
0    47
dtype: int64



In [171]:

    
# We convert all the 1s to 0s and 0s to 1s.
predY = np.choose(model.labels_, [1,3,2]).astype(np.int64)



In [172]:

    
pd.value_counts(y['winetype'])









    Out[172]:





2    71
1    59
3    48
Name: winetype, dtype: int64



In [173]:

    
pd.value_counts(predY)









    Out[173]:





2    69
3    62
1    47
dtype: int64



In [174]:

    
# Performance Metrics
sm.accuracy_score(y, predY)









    Out[174]:





0.702247191011236



In [175]:

    
# Confusion Matrix
sm.confusion_matrix(y, predY)









    Out[175]:





array([[46,  0, 13],
       [ 1, 50, 20],
       [ 0, 19, 29]])



In [138]:

    
!pip install ggplot
#import ggplot as ggplot









    



Requirement already satisfied: ggplot in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: scipy in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: patsy>=0.4 in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: cycler in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: statsmodels in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: pandas in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: matplotlib in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: brewer2mpl in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: six in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: numpy in c:\users\kogentix\anaconda3\lib\site-packages (from ggplot)
Requirement already satisfied: python-dateutil>=2 in c:\users\kogentix\anaconda3\lib\site-packages (from pandas->ggplot)
Requirement already satisfied: pytz>=2011k in c:\users\kogentix\anaconda3\lib\site-packages (from pandas->ggplot)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=1.5.6 in c:\users\kogentix\anaconda3\lib\site-packages (from matplotlib->ggplot)



In [139]:

    
from ggplot import *
%matplotlib inline



In [140]:

    
p = ggplot(aes(x='Alcohol', y='Ash',color="winetype"), data=wine)
p + geom_point()









    












    Out[140]:





<ggplot: (-9223371877061043950)>



In [141]:

    
p2 = ggplot(aes(x='Alcohol', y='Ash',color="predY"), data=wine)
p2 + geom_point()









    












    Out[141]:





<ggplot: (-9223371877060637994)>



In [142]:

    
from sklearn import preprocessing



In [143]:

    
x_scaled = preprocessing.scale(x)



In [144]:

    
x.head()









    Out[144]:







  
    
      
      Alcohol
      Malic acid
      Ash
      Alcalinity of ash
      Magnesium
      Total phenols
      Flavanoids
      Nonflavanoid phenols
      Proanthocyanins
      Color intensity
      Hue
      OD280/OD315 of diluted wines
      Proline
    
  
  
    
      0
      14.23
      1.71
      2.43
      15.6
      127
      2.80
      3.06
      0.28
      2.29
      5.64
      1.04
      3.92
      1065
    
    
      1
      13.20
      1.78
      2.14
      11.2
      100
      2.65
      2.76
      0.26
      1.28
      4.38
      1.05
      3.40
      1050
    
    
      2
      13.16
      2.36
      2.67
      18.6
      101
      2.80
      3.24
      0.30
      2.81
      5.68
      1.03
      3.17
      1185
    
    
      3
      14.37
      1.95
      2.50
      16.8
      113
      3.85
      3.49
      0.24
      2.18
      7.80
      0.86
      3.45
      1480
    
    
      4
      13.24
      2.59
      2.87
      21.0
      118
      2.80
      2.69
      0.39
      1.82
      4.32
      1.04
      2.93
      735



In [145]:

    
x_scaled









    Out[145]:





array([[ 1.51861254, -0.5622498 ,  0.23205254, ...,  0.36217728,
         1.84791957,  1.01300893],
       [ 0.24628963, -0.49941338, -0.82799632, ...,  0.40605066,
         1.1134493 ,  0.96524152],
       [ 0.19687903,  0.02123125,  1.10933436, ...,  0.31830389,
         0.78858745,  1.39514818],
       ..., 
       [ 0.33275817,  1.74474449, -0.38935541, ..., -1.61212515,
        -1.48544548,  0.28057537],
       [ 0.20923168,  0.22769377,  0.01273209, ..., -1.56825176,
        -1.40069891,  0.29649784],
       [ 1.39508604,  1.58316512,  1.36520822, ..., -1.52437837,
        -1.42894777, -0.59516041]])



In [146]:

    
x.columns









    Out[146]:





Index(['Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',
       'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
       'Proanthocyanins', 'Color intensity', 'Hue',
       'OD280/OD315 of diluted wines', 'Proline'],
      dtype='object')



In [147]:

    
x.index









    Out[147]:





RangeIndex(start=0, stop=178, step=1)



In [148]:

    
x_scaleddf=pd.DataFrame(data=x_scaled,index=x.index,columns=x.columns)



In [149]:

    
# K Means Cluster
model2 = KMeans(n_clusters=3)
model2.fit(x_scaleddf)









    Out[149]:





KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)



In [150]:

    
model2.labels_









    Out[150]:





array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])



In [151]:

    
pd.value_counts(model2.labels_)









    Out[151]:





2    65
0    62
1    51
dtype: int64



In [152]:

    
pd.value_counts(y['winetype'])









    Out[152]:





2    71
1    59
3    48
Name: winetype, dtype: int64



In [157]:

    
# We convert all the 1s to 0s and 0s to 1s.
predY2 = np.choose(model2.labels_, [1, 3, 2]).astype(np.int64)



In [158]:

    
pd.value_counts(predY2)









    Out[158]:





2    65
1    62
3    51
dtype: int64



In [159]:

    
sm.accuracy_score(y, predY2)









    Out[159]:





0.9662921348314607



In [160]:

    
sm.confusion_matrix(y, predY2)









    Out[160]:





array([[59,  0,  0],
       [ 3, 65,  3],
       [ 0,  0, 48]])



In [ ]:

	0	1	2	3	4	5	6	7	8	9	10	11	12	13
0	1	14.23	1.71	2.43	15.6	127	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065
1	1	13.20	1.78	2.14	11.2	100	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050
2	1	13.16	2.36	2.67	18.6	101	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185
3	1	14.37	1.95	2.50	16.8	113	3.85	3.49	0.24	2.18	7.80	0.86	3.45	1480
4	1	13.24	2.59	2.87	21.0	118	2.80	2.69	0.39	1.82	4.32	1.04	2.93	735

	winetype	Alcohol	Malic acid	Ash	Alcalinity of ash	Magnesium	Total phenols	Flavanoids	Nonflavanoid phenols	Proanthocyanins	Color intensity	Hue	OD280/OD315 of diluted wines	Proline
count	178.000000	178.000000	178.000000	178.000000	178.000000	178.000000	178.000000	178.000000	178.000000	178.000000	178.000000	178.000000	178.000000	178.000000
mean	1.938202	13.000618	2.336348	2.366517	19.494944	99.741573	2.295112	2.029270	0.361854	1.590899	5.058090	0.957449	2.611685	746.893258
std	0.775035	0.811827	1.117146	0.274344	3.339564	14.282484	0.625851	0.998859	0.124453	0.572359	2.318286	0.228572	0.709990	314.907474
min	1.000000	11.030000	0.740000	1.360000	10.600000	70.000000	0.980000	0.340000	0.130000	0.410000	1.280000	0.480000	1.270000	278.000000
25%	1.000000	12.362500	1.602500	2.210000	17.200000	88.000000	1.742500	1.205000	0.270000	1.250000	3.220000	0.782500	1.937500	500.500000
50%	2.000000	13.050000	1.865000	2.360000	19.500000	98.000000	2.355000	2.135000	0.340000	1.555000	4.690000	0.965000	2.780000	673.500000
75%	3.000000	13.677500	3.082500	2.557500	21.500000	107.000000	2.800000	2.875000	0.437500	1.950000	6.200000	1.120000	3.170000	985.000000
max	3.000000	14.830000	5.800000	3.230000	30.000000	162.000000	3.880000	5.080000	0.660000	3.580000	13.000000	1.710000	4.000000	1680.000000