Homework 4:

  1. Follow the steps below to:
    • Read wine.csv in the data folder.
    • The first column contains the Wine category. Don't use it in the models below; we are going to treat this as unsupervised learning and compare the results to the Wine column.
  2. Try KMeans where n_clusters = 3 and compare the clusters to the Wine column.
  3. Try PCA and see how much you can reduce the variable space.
    • How many components do you need to explain 99% of the variance in this dataset?
    • Plot the PCA variables to see if they bring out the clusters.
  4. Try KMeans and Hierarchical Clustering using the data from PCA and again compare the clusters to the Wine column.

Dataset

wine.csv is in the data folder under homeworks.
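
Steps 2 and 4 ask to compare clusters against the Wine column. A minimal sketch of that comparison (assuming the same wine.csv layout, with the label in the first column) could use a cross-tabulation; the variable names here are just for illustration:

In [ ]:
# hedged sketch: cluster the 13 feature columns and cross-tabulate against the Wine label
import pandas as pd
from sklearn.cluster import KMeans

wine = pd.read_csv('../data/wine.csv')
features = wine.iloc[:, 1:]                  # drop the Wine label column
clusters = KMeans(n_clusters=3).fit_predict(features)
print(pd.crosstab(wine['Wine'], clusters))   # rows = true category, columns = cluster id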


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans

%matplotlib inline

In [3]:
dataset = pd.read_csv('../data/wine.csv')
dataset.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 178 entries, 0 to 177
Data columns (total 14 columns):
Wine                    178 non-null int64
Alcohol                 178 non-null float64
Malic.acid              178 non-null float64
Ash                     178 non-null float64
Acl                     178 non-null float64
Mg                      178 non-null int64
Phenols                 178 non-null float64
Flavanoids              178 non-null float64
Nonflavanoid.phenols    178 non-null float64
Proanth                 178 non-null float64
Color.int               178 non-null float64
Hue                     178 non-null float64
OD                      178 non-null float64
Proline                 178 non-null int64
dtypes: float64(11), int64(3)
memory usage: 20.9 KB

In [4]:
#The first column contains the Wine category. Don't use it in the models below; we treat this as unsupervised learning and compare the results to the Wine column.
subset = dataset.iloc[:,1:14]
subset.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 178 entries, 0 to 177
Data columns (total 13 columns):
Alcohol                 178 non-null float64
Malic.acid              178 non-null float64
Ash                     178 non-null float64
Acl                     178 non-null float64
Mg                      178 non-null int64
Phenols                 178 non-null float64
Flavanoids              178 non-null float64
Nonflavanoid.phenols    178 non-null float64
Proanth                 178 non-null float64
Color.int               178 non-null float64
Hue                     178 non-null float64
OD                      178 non-null float64
Proline                 178 non-null int64
dtypes: float64(11), int64(2)
memory usage: 19.5 KB

In [5]:
#Try KMeans where n_clusters = 3 and compare the clusters to the Wine column.
kSampler = KMeans(n_clusters = 3)
kSampler.fit(subset)


Out[5]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=0)

In [6]:
prediction = kSampler.predict(subset)
prediction


Out[6]:
array([1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1,
       1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 0, 2, 0, 0, 2,
       2, 2, 0, 0, 1, 2, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0,
       0, 0, 0, 2, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 2, 2, 0,
       0, 0, 2, 2, 0, 0, 2, 2, 0, 2, 2, 0, 0, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0,
       2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0], dtype=int32)

In [7]:
correct = dataset.iloc[:,0]
cmatrix = correct.values
cmatrix


Out[7]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [8]:
#we first need to map the Wine labels 1-3 into the range 0-2 (via mod 3)
normalizer = np.vectorize(lambda x: x%3)
cmatrix = normalizer(cmatrix)
cmatrix


Out[8]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [9]:
#so the main question is how the cluster numbers map onto the Wine labels
#we can build a matrix of the counts and then pick the mapping with the best counts
from sklearn.metrics import confusion_matrix
confusion_matrix(cmatrix, prediction)


Out[9]:
array([[19,  0, 29],
       [ 0, 46, 13],
       [50,  1, 20]])
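
A hedged alternative to eyeballing the matrix (or looping over every permutation, as done below) is to let the Hungarian algorithm pick the cluster-to-label assignment that maximizes the diagonal. This sketch assumes a SciPy version that provides linear_sum_assignment:

In [ ]:
# hedged sketch: optimal cluster-to-label matching via the Hungarian algorithm
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(cmatrix, prediction)
rows, cols = linear_sum_assignment(-cm)      # maximize the diagonal by minimizing -counts
mapping = dict(zip(cols, rows))              # predicted cluster id -> matched true label
aligned = np.array([mapping[p] for p in prediction])
print(confusion_matrix(cmatrix, aligned))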

In [10]:
#so it looks like 0->2,1->1,2->0
#we can create a function that will map it for us
def switchGuess(x):
    x = x%3
    if (x==0):
        return 2
    if (x==1):
        return 1
    return 0
mapper = np.vectorize(switchGuess)
diagonalPrediction = mapper(prediction)
confusion_matrix(cmatrix, diagonalPrediction)


Out[10]:
array([[29,  0, 19],
       [13, 46,  0],
       [20,  1, 50]])

In [11]:
#and plot it
def plotconfusion(answer,prediction):
    cm = confusion_matrix(answer, prediction)
    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
plotconfusion(cmatrix, diagonalPrediction)



In [12]:
#let's build something that scores a confusion matrix (sum of off-diagonal counts, lower is better) so we can pick the best label permutation
import itertools
def score_matrix(cm):
    retval = 0
    for x in range(0,3):
        for y in range(0,3):
            if (x != y):
                retval = retval + cm[x,y]
    return retval


def bestMatch(a,b,verbose=0):
    bestscore = -1
    bestmatrix = 0
    for perm in itertools.permutations([0,1,2]):
        
        def switchGuess(x):
            x = x%3
            return perm[x]
        
        mapper = np.vectorize(switchGuess)
        diagonalPrediction = mapper(b)
        cm = confusion_matrix(a, diagonalPrediction)
        checkval = score_matrix(cm)

        if (bestscore < 0):
            bestscore = checkval
            bestmatrix = cm
            if (verbose):
                print "initial", cm, checkval
            
        if(checkval<bestscore):
            bestscore = checkval
            bestmatrix = cm
            if (verbose):
                print "better", cm, checkval
      
    return bestmatrix

bestMatch(cmatrix, prediction,1)


initial [[19  0 29]
 [ 0 46 13]
 [50  1 20]] 93
better [[29  0 19]
 [13 46  0]
 [20  1 50]] 53
Out[12]:
array([[29,  0, 19],
       [13, 46,  0],
       [20,  1, 50]])
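
A permutation-invariant way to score the agreement, avoiding the matching step entirely, is the adjusted Rand index; a minimal sketch using the arrays already defined above:

In [ ]:
# hedged sketch: adjusted Rand index ignores how the cluster ids are numbered
from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(cmatrix, prediction))   # 1.0 = perfect agreement, ~0.0 = random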

In [13]:
#we are going to keep various models around to see how they do
modelRuns = {}

def runkSample(name,x,verbose=0):
    proSampler = KMeans(n_clusters = 3)
    proSampler.fit(x)
    proprediction = proSampler.predict(x)
    if (verbose):
        print "shape:", x.shape
    
    ret = bestMatch(cmatrix, proprediction,verbose)
    modelRuns[name] = (x,ret,len(modelRuns)) 
    if (verbose):
        print "matrix:", ret
    return ret
runkSample("All",subset)


Out[13]:
array([[29,  0, 19],
       [13, 46,  0],
       [20,  1, 50]])

In [14]:
#Try PCA and see how much you can reduce the variable space.
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(subset)
plt.plot(pca.explained_variance_ratio_);
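
To answer the 99%-of-variance question directly, we can look at the cumulative sum of the explained variance ratios; a sketch assuming the full pca fitted in the cell above:

In [ ]:
# hedged sketch: how many components are needed to reach 99% explained variance
import numpy as np
cumulative = np.cumsum(pca.explained_variance_ratio_)
print(np.argmax(cumulative >= 0.99) + 1)   # first index where the cumulative ratio crosses 0.99
print(cumulative[:3])                      # on the unscaled data the first component dominates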



In [15]:
#hard to believe that a single component explains nearly all of the variance
pca = PCA(1)
X2_few_wine = pca.fit_transform(subset)
X2_few_wine[0:5]


Out[15]:
array([[-318.56297929],
       [-303.09741966],
       [-438.06113292],
       [-733.24013935],
       [  11.5714285 ]])

In [16]:
runkSample("PCA (All) n=1",X2_few_wine)


Out[16]:
array([[29,  0, 19],
       [13, 46,  0],
       [20,  1, 50]])

In [17]:
#Let's see how they stack up
def showKMeanScores():
    print "Model".ljust(25) + "Col".ljust(4) + "score"
    print "-----".ljust(25) + "---".ljust(4) + "-----"
    rows = [0] * len(modelRuns)
    for name in modelRuns.keys():
        x, ret, i = modelRuns[name]
        rows[i%len(modelRuns)] =  name.ljust(25) + str(x.shape[1]).ljust(4) + str(score_matrix(ret))   
    for row in rows:
        print row
showKMeanScores()


Model                    Col score
-----                    --- -----
All                      13  53
PCA (All) n=1            1   53

In [18]:
#let's see how each original column relates to the single PCA component
#each chart plots one column against the PCA projection
for i in range(0,14):
    x = dataset.iloc[:,i]
    name = dataset.columns.values[i]
    fig = plt.figure();
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(x,X2_few_wine)
    ax.set_title("(" + str(i) +") " +name)
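
Another way to see which column drives the first principal component is to look at the PCA loadings directly; a sketch assuming the one-component pca fitted in In [15]:

In [ ]:
# hedged sketch: loadings of the single unscaled component; Proline should carry almost all of the weight
import pandas as pd
loadings = pd.Series(pca.components_[0], index=subset.columns)
print(loadings.abs().sort_values())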



In [18]:
#it looks like all of the information is in the Proline column
proline = subset.iloc[:,12:]
runkSample("Proline",proline)


/Users/arthurconner/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.py:797: RuntimeWarning: Got data type int64, converted to float to avoid overflows
  X = self._check_test_data(X)
Out[18]:
array([[29,  0, 19],
       [13, 46,  0],
       [20,  1, 50]])

In [19]:
#let's see how we do without Proline
amateurColumns = dataset.iloc[:,1:13]
other = runkSample("All (no Proline)",amateurColumns)

In [20]:
showKMeanScores()


Model                    Col score
-----                    --- -----
All                      13  53
PCA (All) n=1            1   53
All (no Proline)         12  90

In [21]:
pca = PCA()
X2_amateur = pca.fit_transform(amateurColumns)
plt.plot(pca.explained_variance_ratio_);



In [22]:
runkSample("PCA (no Proline) n=12",X2_amateur)


Out[22]:
array([[ 5, 24, 19],
       [15, 35,  9],
       [ 6, 19, 46]])

In [23]:
#now with just one component
amateurPCA = PCA(1)
amateurSingle = amateurPCA.fit_transform(amateurColumns)
for i in range(0,13):
    x = subset.iloc[:,i]
    name = "(" + str(i) +") " +subset.columns.values[i]
    fig = plt.figure();
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(x,amateurSingle)
    ax.set_title(name)



In [36]:
#it now looks like all of the information is coming from the Mg column
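
The same loadings check as above, applied to the no-Proline component (the amateurPCA fitted in In [23]), should confirm this; a hedged sketch:

In [ ]:
# hedged sketch: loadings of the single no-Proline component; Mg should dominate on the unscaled data
import pandas as pd
loadings = pd.Series(amateurPCA.components_[0], index=amateurColumns.columns)
print(loadings.abs().sort_values())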


In [24]:
#and then it dawns on me that this is because I haven't normalized the data
from sklearn.preprocessing import normalize

#we are going to keep various models around to see how they do
modelNormRuns = {}

def runNormkSample(name,xnotNorm,verbose=0):
    proSampler = KMeans(n_clusters = 3)
    x = normalize(xnotNorm,axis = 0)
    proSampler.fit(x)
    proprediction = proSampler.predict(x)
    if (verbose):
        print "shape:", x.shape
    
    ret = bestMatch(cmatrix, proprediction,verbose)
    modelNormRuns[name] = (x,ret,len(modelNormRuns)) 
    if (verbose):
        print "matrix:", ret
    return ret

def showNormScores():

    print "Model".ljust(25) + "Col".ljust(4) + "Score".ljust(10) + "Prior".ljust(10)
    print "-----".ljust(25) + "---".ljust(4) + "-----".ljust(10) + "-----".ljust(10)
    rows = [0] * len(modelNormRuns)
    prior = [0] * len(modelNormRuns)
    for name in modelNormRuns.keys():
        x, ret, i = modelNormRuns[name]
        base = name.ljust(25) + str(x.shape[1]).ljust(4) + str(score_matrix(ret)).ljust(10)

        other = "NA"
        if (name  in modelRuns.keys()):
            px, pret, pi = modelRuns[name]
            other = str(score_matrix(pret))
        rows[i%len(modelNormRuns)] = base + other.ljust(10)
    
    for row in rows:
        print row
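
Worth noting: normalize(..., axis=0) rescales each column to unit L2 norm, which is related to but not the same as standardization (zero mean, unit variance), the more common preprocessing for K-Means and PCA. A sketch of the standardized variant, assuming the subset and helpers defined earlier:

In [ ]:
# hedged sketch: standardize the columns before clustering and score with the bestMatch helper
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

scaled = StandardScaler().fit_transform(subset)
scaled_pred = KMeans(n_clusters=3).fit_predict(scaled)
print(bestMatch(cmatrix, scaled_pred))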


In [25]:
modelNormRuns = {}

def migrateToNorm():
    nameNorm = [0] * len(modelRuns)
    xValNorm =  [0] * len(modelRuns)

    for name in modelRuns.keys():
       # print "***** "+ name
        x, ret, i = modelRuns[name]
        #runNormkSample(name,x,1) 
        nameNorm[i]= name
        xValNorm[i]=x
    
    for i in range(0,len(modelRuns)):
        name = nameNorm[i]
        x = xValNorm[i]
        runNormkSample(name,x) 
migrateToNorm()
showNormScores()


Model                    Col Score     Prior     
-----                    --- -----     -----     
All                      13  13        53        
PCA (All) n=1            1   53        53        
All (no Proline)         12  27        90        
PCA (no Proline) n=12    12  35        92        

In [27]:
#it looks like using all of the columns is significantly better when normalized. Removing Proline, it was much better normalized as well.
#For the single columns it was much better not normalized

In [26]:
#let's do some plots of the difference for fun
best_x, best_ret, best_i = modelNormRuns["All"]
print best_ret
#plotconfusion(cmatrix, best_ret)
plt.matshow(best_ret)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


[[47  0  1]
 [ 0 59  0]
 [ 2 10 59]]

In [27]:
best_x, best_ret, best_i = modelRuns["All"]
plt.matshow(best_ret)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')


Out[27]:
<matplotlib.text.Text at 0x10a044610>

In [28]:
#so now the question is: with normalization, how many dimensions do we need?
normset = normalize(subset,axis = 0)
pca = PCA()
pca.fit(normset)
plt.plot(pca.explained_variance_ratio_);
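
scikit-learn's PCA can also pick the component count for a variance target itself when n_components is passed as a fraction; a sketch on the normalized data, assuming the installed version supports this:

In [ ]:
# hedged sketch: let PCA choose how many components cover 99% of the variance
pca99 = PCA(n_components=0.99)
reduced99 = pca99.fit_transform(normset)
print(reduced99.shape[1])   # number of components kept for the normalized data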



In [29]:
#we can also run k-means for each number of components
for i in range(1,12):
    iPCA = PCA(i)
    iPCGuess = iPCA.fit_transform(normset)
    name = "PCA (normizing) n=(" + str(i) +") " 
    runkSample(name,iPCGuess)
    
showKMeanScores()


Model                    Col score
-----                    --- -----
All                      13  53
PCA (All) n=1            1   53
All (no Proline)         12  90
PCA (no Proline) n=12    12  92
PCA (normizing) n=(1)    1   33
PCA (normizing) n=(2)    2   15
PCA (normizing) n=(3)    3   14
PCA (normizing) n=(4)    4   13
PCA (normizing) n=(5)    5   13
PCA (normizing) n=(6)    6   13
PCA (normizing) n=(7)    7   13
PCA (normizing) n=(8)    8   13
PCA (normizing) n=(9)    9   13
PCA (normizing) n=(10)   10  13
PCA (normizing) n=(11)   11  13

In [30]:
#So now it looks like you need about 4 components.
#let's do the full table of it
modelNormRuns = {}
migrateToNorm()
showNormScores()


Model                    Col Score     Prior     
-----                    --- -----     -----     
All                      13  13        53        
PCA (All) n=1            1   53        53        
All (no Proline)         12  27        90        
PCA (no Proline) n=12    12  52        92        
PCA (normizing) n=(1)    1   33        33        
PCA (normizing) n=(2)    2   10        15        
PCA (normizing) n=(3)    3   12        14        
PCA (normizing) n=(4)    4   6         13        
PCA (normizing) n=(5)    5   5         13        
PCA (normizing) n=(6)    6   4         13        
PCA (normizing) n=(7)    7   5         13        
PCA (normizing) n=(8)    8   23        13        
PCA (normizing) n=(9)    9   11        13        
PCA (normizing) n=(10)   10  18        13        
PCA (normizing) n=(11)   11  63        13        


In [39]:
# compute distances and plot a dendrogram
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram

# not printed as pretty, but the values are correct
def makedendrogramplot(a,b):
    # linkage expects a condensed distance vector (or raw observations), not a square matrix
    distx = pdist(a, metric='euclidean')
    R = dendrogram(linkage(distx, method='single'), color_threshold=100, truncate_mode="level", p=4)
    plt.xlabel('points')
    plt.ylabel('Height')
    plt.suptitle('Cluster Dendrogram of '+b, fontweight='bold', fontsize=14);
makedendrogramplot(subset,"All")



In [34]:
#for name in modelRuns.keys():
#        x,ret,i = modelRuns[name]
#        fig = plt.figure();
#        makedendrogramplot(x,name)
makedendrogramplot(normset,"All (normalized)")



In [35]:
modelRuns.keys()


Out[35]:
['PCA (normizing) n=(6) ',
 'PCA (normizing) n=(8) ',
 'All',
 'PCA (normizing) n=(3) ',
 'PCA (normizing) n=(4) ',
 'PCA (normizing) n=(9) ',
 'PCA (normizing) n=(7) ',
 'PCA (no Proline) n=12',
 'PCA (normizing) n=(1) ',
 'All (no Proline)',
 'PCA (normizing) n=(2) ',
 'PCA (normizing) n=(11) ',
 'PCA (normizing) n=(5) ',
 'PCA (All) n=1',
 'PCA (normizing) n=(10) ']

In [36]:
name ='PCA (normizing) n=(4) '
x,ret,i = modelRuns[name]
makedendrogramplot(x,name)



In [37]:
#this was one of the best runs (see the normalized score table above)
name ='PCA (normizing) n=(4) '
x,ret,i = modelNormRuns[name]
makedendrogramplot(x,name)
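
To actually compare the hierarchical clustering to the Wine column (step 4), the tree can be cut into three flat clusters and scored the same way as the K-Means runs; a sketch using SciPy's fcluster on the same PCA-reduced data (Ward linkage chosen here instead of the single linkage used for the dendrograms):

In [ ]:
# hedged sketch: cut the hierarchical tree into 3 clusters and score against the labels
from scipy.cluster.hierarchy import linkage, fcluster

Z = linkage(x, method='ward')                            # x is the PCA data plotted above
hier_pred = fcluster(Z, t=3, criterion='maxclust') - 1   # shift cluster ids to 0..2
print(bestMatch(cmatrix, hier_pred))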