In [116]:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm
import pandas as pd
import numpy as np
In [117]:
# Fetch the UCI Wine dataset over HTTPS rather than plain HTTP so the
# download cannot be tampered with in transit. The file has no header row,
# so header=None leaves the columns with integer labels until they are named.
wine = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
    header=None,
)
In [118]:
# Preview the first rows of the freshly loaded frame (columns are still unnamed here).
wine.head()
Out[118]:
In [119]:
# Name the columns: the class label first, followed by the 13 measurements.
wine.columns = [
    'winetype',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
In [120]:
# Re-display the first rows to confirm the column names were applied.
wine.head()
Out[120]:
In [121]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
wine.info()
In [122]:
# Summary statistics for every numeric column (note the very different scales across features).
wine.describe()
Out[122]:
In [123]:
# Class balance of the true labels. Uses Series.value_counts() because the
# top-level pd.value_counts() helper is deprecated in recent pandas.
wine['winetype'].value_counts()
Out[123]:
In [124]:
# Separate the 13 feature columns from the class label.
# y is deliberately kept as a one-column DataFrame (not a Series) because
# later cells read y.columns and y['winetype'].
x = wine.drop(columns=['winetype'])
y = wine[['winetype']]
In [125]:
# Sanity check: the feature frame should list only the measurement columns.
x.columns
Out[125]:
In [126]:
# Sanity check: the label frame should hold just the 'winetype' column.
y.columns
Out[126]:
In [127]:
# Peek at the first feature rows.
x.head()
Out[127]:
In [128]:
# Peek at the first label rows.
y.head()
Out[128]:
In [129]:
# K Means Cluster (unscaled features)
# KMeans starts from random centroids, so without a fixed seed the cluster
# ids (0/1/2) can be permuted between runs. Pin random_state and n_init so
# the labels, and every metric derived from them, are reproducible.
model = KMeans(n_clusters=3, n_init=10, random_state=42)
model.fit(x)
Out[129]:
In [130]:
# Cluster id that the fitted model assigned to each row of x.
model.labels_
Out[130]:
In [131]:
# Size of each true class, for comparison with the cluster sizes.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
y['winetype'].value_counts()
Out[131]:
In [132]:
# Size of each cluster. labels_ is a NumPy array, so wrap it in a Series
# instead of calling the deprecated top-level pd.value_counts().
pd.Series(model.labels_).value_counts()
Out[132]:
In [171]:
# KMeans cluster ids are arbitrary, so a hard-coded permutation such as
# [1, 3, 2] is only right for one particular run. Instead, derive the mapping
# from the data: cross-tabulate cluster id against true wine type and relabel
# each cluster with its most frequent wine type (majority vote).
# Note: two clusters may map to the same wine type if the clustering is poor.
predY = (
    pd.crosstab(model.labels_, y['winetype'].to_numpy())
    .idxmax(axis=1)          # cluster id -> dominant wine type
    .loc[model.labels_]      # look the mapping up for every sample
    .to_numpy()
    .astype(np.int64)
)
In [172]:
# True class sizes again, shown next to the relabelled predictions below.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
y['winetype'].value_counts()
Out[172]:
In [173]:
# Predicted class sizes after relabelling. predY is a NumPy array, so wrap it
# in a Series rather than using the deprecated top-level pd.value_counts().
pd.Series(predY).value_counts()
Out[173]:
In [174]:
# Performance metrics: fraction of samples whose relabelled cluster
# equals the true wine type (unscaled features).
sm.accuracy_score(y_true=y['winetype'], y_pred=predY)
Out[174]:
In [175]:
# Confusion matrix: rows are true wine types, columns are predicted types.
sm.confusion_matrix(y_true=y['winetype'], y_pred=predY)
Out[175]:
In [138]:
!pip install ggplot
#import ggplot as ggplot
In [139]:
from ggplot import *
%matplotlib inline
In [140]:
# Scatter of Alcohol against Ash, coloured by the true wine type.
p = ggplot(
    aes(x='Alcohol', y='Ash', color="winetype"),
    data=wine,
)
p + geom_point()
Out[140]:
In [141]:
# Scatter of Alcohol against Ash, coloured by the predicted wine type.
# predY is a standalone array, not a column of `wine`, so attach it to a
# temporary copy of the frame; otherwise the 'predY' aesthetic cannot be
# resolved. assign() leaves `wine` itself unmodified.
p2 = ggplot(
    aes(x='Alcohol', y='Ash', color="predY"),
    data=wine.assign(predY=predY),
)
p2 + geom_point()
Out[141]:
In [142]:
from sklearn import preprocessing
In [143]:
# Standardise each feature column to zero mean and unit variance so that no
# single large-valued feature dominates the Euclidean distances KMeans uses.
x_scaled = preprocessing.scale(x, axis=0, with_mean=True, with_std=True)
In [144]:
# Original (unscaled) feature values, for comparison with the scaled array.
x.head()
Out[144]:
In [145]:
# Show only the first five scaled rows, mirroring x.head() above, instead of echoing the whole array.
x_scaled[:5]
Out[145]:
In [146]:
# Column labels of x; reused below to label the scaled array.
x.columns
Out[146]:
In [147]:
# Row index of x; reused below so the scaled frame lines up row-for-row with x.
x.index
Out[147]:
In [148]:
# Wrap the scaled array back into a DataFrame carrying x's row index and column names.
x_scaleddf = pd.DataFrame(
    x_scaled,
    index=x.index,
    columns=x.columns,
)
In [149]:
# K Means Cluster (standardised features)
# Pin random_state and n_init so the cluster ids are reproducible across
# runs; otherwise the downstream relabelling and metrics change each time.
model2 = KMeans(n_clusters=3, n_init=10, random_state=42)
model2.fit(x_scaleddf)
Out[149]:
In [150]:
# Cluster id that the second model assigned to each row of x_scaleddf.
model2.labels_
Out[150]:
In [151]:
# Cluster sizes for the scaled-feature model. labels_ is a NumPy array, so
# wrap it in a Series instead of the deprecated top-level pd.value_counts().
pd.Series(model2.labels_).value_counts()
Out[151]:
In [152]:
# True class sizes, for comparison with the scaled-model cluster sizes.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
y['winetype'].value_counts()
Out[152]:
In [157]:
# Cluster ids from model2 are arbitrary and unrelated to those of the first
# model, so a fixed permutation cannot be reused. Relabel each cluster with
# the wine type that is most frequent among its members (majority vote).
# Note: two clusters may map to the same wine type if the clustering is poor.
predY2 = (
    pd.crosstab(model2.labels_, y['winetype'].to_numpy())
    .idxmax(axis=1)           # cluster id -> dominant wine type
    .loc[model2.labels_]      # look the mapping up for every sample
    .to_numpy()
    .astype(np.int64)
)
In [158]:
# Predicted class sizes for the scaled-feature model. predY2 is a NumPy array,
# so wrap it in a Series rather than the deprecated top-level pd.value_counts().
pd.Series(predY2).value_counts()
Out[158]:
In [159]:
# Accuracy of the scaled-feature clustering against the true wine types.
sm.accuracy_score(y_true=y['winetype'], y_pred=predY2)
Out[159]:
In [160]:
# Confusion matrix for the scaled-feature model: rows are true types, columns are predicted types.
sm.confusion_matrix(y_true=y['winetype'], y_pred=predY2)
Out[160]:
In [ ]: