Using wine.csv in the data folder:
Try KMeans where n_clusters = 3 and compare the clusters to the Wine column.
Try KMeans and Hierarchical Clustering using data from PCA and compare the clusters to the Wine column again.
In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
%matplotlib inline
In [3]:
dataset = pd.read_csv('../data/wine.csv')
dataset.info()
In [4]:
#The first column contains the Wine category. Don't use it in the models below; we treat this as unsupervised learning and compare the results to the Wine column.
subset = dataset.iloc[:,1:14]
subset.info()
In [5]:
#Try KMeans where n_clusters = 3 and compare the clusters to the Wine column.
kSampler = KMeans(n_clusters = 3)
kSampler.fit(subset)
Out[5]:
In [6]:
prediction = kSampler.predict(subset)
prediction
Out[6]:
In [7]:
correct = dataset.iloc[:,0]
cmatrix = correct.values  # as_matrix() is deprecated; .values gives the same NumPy array
cmatrix
Out[7]:
In [8]:
#we first need to map the wine labels (1-3) onto 0-2
normalizer = np.vectorize(lambda x: x%3)
cmatrix = normalizer(cmatrix)
cmatrix
Out[8]:
In [9]:
#so the main question is how the numbers from one labeling match up with the other
#we can create a matrix of the counts and then pick the best count
from sklearn.metrics import confusion_matrix
confusion_matrix(cmatrix, prediction)
Out[9]:
In [10]:
#so it looks like 0->2,1->1,2->0
#we can create a function that will map it for us
def switchGuess(x):
    x = x%3
    if (x==0):
        return 2
    if (x==1):
        return 1
    return 0
mapper = np.vectorize(switchGuess)
diagonalPrediction = mapper(prediction)
confusion_matrix(cmatrix, diagonalPrediction)
Out[10]:
In [11]:
#and plot it
def plotconfusion(answer,prediction):
    cm = confusion_matrix(answer, prediction)
    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
plotconfusion(cmatrix, diagonalPrediction)
In [12]:
#let's build something that scores a confusion matrix, so we can pick the labeling that fits best
import itertools
def score_matrix(cm):
    retval = 0
    for x in range(0,3):
        for y in range(0,3):
            if (x != y):
                retval = retval + cm[x,y]
    return retval
def bestMatch(a,b,verbose=0):
    bestscore = -1
    bestmatrix = 0
    for perm in itertools.permutations([0,1,2]):
        def switchGuess(x):
            x = x%3
            return perm[x]
        mapper = np.vectorize(switchGuess)
        diagonalPrediction = mapper(b)
        cm = confusion_matrix(a, diagonalPrediction)
        checkval = score_matrix(cm)
        if (bestscore < 0):
            bestscore = checkval
            bestmatrix = cm
            if (verbose):
                print "initial", cm, checkval
        if(checkval<bestscore):
            bestscore = checkval
            bestmatrix = cm
            if (verbose):
                print "better", cm, checkval
    return bestmatrix
bestMatch(cmatrix, prediction,1)
Out[12]:
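As an aside, the same best-permutation search can be done without brute force: the Hungarian algorithm picks the label matching that maximizes the diagonal of the confusion matrix. A minimal sketch, assuming scipy.optimize.linear_sum_assignment is available (the helper name is mine):

from scipy.optimize import linear_sum_assignment

def best_match_hungarian(answer, prediction):
    cm = confusion_matrix(answer, prediction)
    # maximize the matched counts by minimizing their negation
    row_ind, col_ind = linear_sum_assignment(-cm)
    remap = dict(zip(col_ind, row_ind))
    remapped = np.vectorize(remap.get)(prediction)
    return confusion_matrix(answer, remapped)

# should produce the same matrix as bestMatch(cmatrix, prediction)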
In [13]:
#we are going to keep various models around to see how they do
modelRuns = {}
def runkSample(name,x,verbose=0):
    proSampler = KMeans(n_clusters = 3)
    proSampler.fit(x)
    proprediction = proSampler.predict(x)
    if (verbose):
        print "shape:", x.shape
    ret = bestMatch(cmatrix, proprediction,verbose)
    modelRuns[name] = (x,ret,len(modelRuns))
    if (verbose):
        print "matrix:", ret
    return ret
runkSample("All",subset)
Out[13]:
In [14]:
#Try PCA and see how much can you reduce the variable space.
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(subset)
plt.plot(pca.explained_variance_ratio_);
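A quick way to read this plot is the cumulative explained variance of the PCA fitted above; a small sketch, with 95% as an assumed threshold:

cumvar = np.cumsum(pca.explained_variance_ratio_)
print(cumvar[:5])
# smallest number of components that keeps at least 95% of the variance
print(np.argmax(cumvar >= 0.95) + 1)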
In [15]:
#hard to believe one component explains nearly everything, but let's try it
pca = PCA(1)
X2_few_wine = pca.fit_transform(subset)
X2_few_wine[0:5]
Out[15]:
In [16]:
runkSample("PCA (All) n=1",X2_few_wine)
Out[16]:
In [17]:
#Let's see how the models stack up
def showKMeanScores():
    print "Model".ljust(25) + "Col".ljust(4) + "Score"
    print "-----".ljust(25) + "---".ljust(4) + "-----"
    rows = [0] * len(modelRuns)
    for name in modelRuns.keys():
        x, ret, i = modelRuns[name]
        rows[i%len(modelRuns)] = name.ljust(25) + str(x.shape[1]).ljust(4) + str(score_matrix(ret))
    for row in rows:
        print row
showKMeanScores()
In [18]:
#let's see how each original column relates to the single PCA component
#one scatter plot per column against the one-dimensional projection
for i in range(0,14):
    x = dataset.iloc[:,i]
    name = dataset.columns.values[i]
    fig = plt.figure();
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(x,X2_few_wine)
    ax.set_title("(" + str(i) +") " +name)
In [18]:
#it looks like all of the information is in the Proline column
proline = subset.iloc[:,12:]
runkSample("Proline",proline)
Out[18]:
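One way to sanity-check this, sketched here against the one-component PCA fitted above (assuming a pandas version with sort_values), is to look at its loadings next to the per-column variances; on unscaled data the first component essentially tracks the column with the largest variance, which is Proline.

# loadings of the single unscaled component vs. raw column variances
loadings = pd.Series(pca.components_[0], index=subset.columns)
print(loadings.abs().sort_values(ascending=False).head())
print(subset.var().sort_values(ascending=False).head())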
In [19]:
#let's try to see how we do without Proline
amateurColumns = dataset.iloc[:,1:13]
other = runkSample("All (no Proline)",amateurColumns)
In [20]:
showKMeanScores()
In [21]:
pca = PCA()
X2_amateur = pca.fit_transform(amateurColumns)
plt.plot(pca.explained_variance_ratio_);
In [22]:
runkSample("PCA (no Proline) n=12",X2_amateur)
Out[22]:
In [23]:
#now just with one
amateurPCA = PCA(1)
amateurSingle = amateurPCA.fit_transform(amateurColumns)
for i in range(0,13):
    x = subset.iloc[:,i]
    name = "(" + str(i) +") " +subset.columns.values[i]
    fig = plt.figure();
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(x,amateurSingle)
    ax.set_title(name)
In [36]:
#it looks like now all of the information is coming from the Magnesium column
In [24]:
#and then it dawns on me this is because I haven't normalized
from sklearn.preprocessing import normalize
#we are going to keep various models around to see how they do
modelNormRuns = {}
def runNormkSample(name,xnotNorm,verbose=0):
    proSampler = KMeans(n_clusters = 3)
    x = normalize(xnotNorm,axis = 0)
    proSampler.fit(x)
    proprediction = proSampler.predict(x)
    if (verbose):
        print "shape:", x.shape
    ret = bestMatch(cmatrix, proprediction,verbose)
    modelNormRuns[name] = (x,ret,len(modelNormRuns))
    if (verbose):
        print "matrix:", ret
    return ret
def showNormScores():
    print "Model".ljust(25) + "Col".ljust(4) + "Score".ljust(10) + "Prior".ljust(10)
    print "-----".ljust(25) + "---".ljust(4) + "-----".ljust(10) + "-----".ljust(10)
    rows = [0] * len(modelNormRuns)
    for name in modelNormRuns.keys():
        x, ret, i = modelNormRuns[name]
        base = name.ljust(25) + str(x.shape[1]).ljust(4) + str(score_matrix(ret)).ljust(10)
        other = "NA"
        if (name in modelRuns.keys()):
            px, pret, pi = modelRuns[name]
            other = str(score_matrix(pret))
        rows[i%len(modelNormRuns)] = base + other.ljust(10)
    for row in rows:
        print row
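Worth noting: normalize(x, axis=0) rescales each column to unit L2 norm. The more usual preprocessing for KMeans is standardization (zero mean, unit variance per column); a minimal sketch for comparison, using sklearn's StandardScaler (the helper name is mine, not something run above):

from sklearn.preprocessing import StandardScaler

def runStandardizedkSample(x):
    scaled = StandardScaler().fit_transform(x)
    sampler = KMeans(n_clusters = 3)
    labels = sampler.fit_predict(scaled)
    return bestMatch(cmatrix, labels)

# e.g. score_matrix(runStandardizedkSample(subset)) can be compared to the normalized runs below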
In [25]:
modelNormRuns = {}
def migrateToNorm():
    nameNorm = [0] * len(modelRuns)
    xValNorm = [0] * len(modelRuns)
    for name in modelRuns.keys():
        # print "***** "+ name
        x, ret, i = modelRuns[name]
        #runNormkSample(name,x,1)
        nameNorm[i]= name
        xValNorm[i]=x
    for i in range(0,len(modelRuns)):
        name = nameNorm[i]
        x = xValNorm[i]
        runNormkSample(name,x)
migrateToNorm()
showNormScores()
In [27]:
#it looks like using all the columns is significantly better when normalized, and removing Proline was also much better normalized.
#For the single-column models it was much better not normalized.
In [26]:
#let's do some plots of the difference for fun.
best_x, best_ret, best_i = modelNormRuns["All"]
print best_ret
#plotconfusion(cmatrix, best_ret)
plt.matshow(best_ret)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [27]:
best_x, best_ret, best_i = modelRuns["All"]
plt.matshow(best_ret)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
Out[27]:
In [28]:
#so now the question is: under normalization, how many dimensions do we need?
normset = normalize(subset,axis = 0)
pca = PCA()
pca.fit(normset)
plt.plot(pca.explained_variance_ratio_);
In [29]:
#we can also run KMeans for each number of components
for i in range(1,12):
    iPCA = PCA(i)
    iPCGuess = iPCA.fit_transform(normset)
    name = "PCA (normalized) n=(" + str(i) +") "
    runkSample(name,iPCGuess)
showKMeanScores()
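In a real unsupervised setting the Wine column would not be available to score against; a label-free alternative is the silhouette score. A small sketch, assuming sklearn.metrics.silhouette_score:

from sklearn.metrics import silhouette_score

for i in range(1,12):
    reduced = PCA(i).fit_transform(normset)
    labels = KMeans(n_clusters = 3).fit_predict(reduced)
    print((i, silhouette_score(reduced, labels)))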
In [30]:
#So now it looks like you need 4 components.
#let's build the full comparison table
modelNormRuns = {}
migrateToNorm()
showNormScores()
In [39]:
# compute distances and plot a dendrogram for hierarchical clustering
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram
def makedenogramplot(a,b):
    # linkage expects the condensed distance vector from pdist, not its square form
    distx = pdist(a, metric='euclidean')
    R = dendrogram(linkage(distx, method='single'), color_threshold=100, truncate_mode="level",p=4)
    plt.xlabel('points')
    plt.ylabel('Height')
    plt.suptitle('Cluster Dendrogram of '+b, fontweight='bold', fontsize=14);
makedenogramplot(subset,"All")
In [34]:
#for name in modelRuns.keys():
# x,ret,i = modelRuns[name]
# fig = plt.figure();
# makedenogramplot(x,name)
makedenogramplot(normset,"All")
In [35]:
modelRuns.keys()
Out[35]:
In [36]:
name = 'PCA (normalized) n=(4) '
x,ret,i = modelRuns[name]
makedenogramplot(x,name)
In [37]:
#this was the best run
name = 'PCA (normalized) n=(4) '
x,ret,i = modelNormRuns[name]
makedenogramplot(x,name)
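The dendrograms only show the tree structure; to actually compare hierarchical clusters to the Wine column, the tree can be cut into three flat clusters and scored with the same confusion-matrix machinery. A minimal sketch using sklearn's AgglomerativeClustering (ward linkage is an assumption, not something tried above):

from sklearn.cluster import AgglomerativeClustering

def runHierarchical(x, method='ward'):
    hc = AgglomerativeClustering(n_clusters = 3, linkage = method)
    labels = hc.fit_predict(x)
    return bestMatch(cmatrix, labels)

# compare the raw and normalized feature sets
print(score_matrix(runHierarchical(subset)))
print(score_matrix(runHierarchical(normset)))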