In [13]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

%matplotlib inline


mpl.rc("savefig", dpi=100)

sns.set_style("white")

greys = sns.color_palette("Greys", 10)

data  = pd.read_csv('../data/yeast.csv', header=None).values
data = pd.read_csv('../data/fittedVals.csv',index_col=0)
labels = data.columns
data = data.values

In [ ]:
# NOTE(review): removed abandoned fragment `data =` — a bare assignment with
# no right-hand side is a SyntaxError and breaks Restart & Run All.
# `data` is already loaded in the cell above.

In [15]:
# k-means with sklearn
# NOTE(review): algorithm='full' is the classic Lloyd algorithm; in
# scikit-learn >= 1.1 this spelling is deprecated in favor of 'lloyd'.
kmeans = KMeans(n_clusters=3, n_init=1, algorithm='full')
kmeans.fit(data)

# Project the data and the fitted cluster centers onto the first two
# principal components for a 2-D visualization.
# (Duplicate in-cell imports of PCA and pyplot removed — both are already
# imported in the top cell.)
pca = PCA(n_components=2, svd_solver='full')
pca.fit(data)

pcs   = pca.transform(data)
cntrs = pca.transform(kmeans.cluster_centers_)

mycmap = matplotlib.colors.ListedColormap(greys[3:])

x, y = pcs[:, 0], pcs[:, 1]

# Calculate the point density (used to shade each point by local density)
xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)

plt.figure(figsize=(6, 6), dpi=100)
plt.scatter(x, y, c=z, s=30, edgecolor='white', cmap=mycmap)
plt.scatter(cntrs[:, 0], cntrs[:, 1], facecolor='red', edgecolor='black', s=120, alpha=0.95)
plt.xlabel('PC1')
plt.ylabel('PC2')  # BUG FIX: was plt.xlabel('PC2'), which overwrote the x-axis label


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-15-fb7a8f8774b7> in <module>()
     18 # Calculate the point density
     19 xy = np.vstack([x,y])
---> 20 z = gaussian_kde(xy)(xy)
     21 
     22 plt.figure(figsize=(6,6),dpi=100)

/Users/kareemcarr/anaconda/lib/python3.5/site-packages/scipy/stats/kde.py in evaluate(self, points)
    212                 tdiff = dot(self.inv_cov, diff)
    213                 energy = sum(diff*tdiff,axis=0) / 2.0
--> 214                 result = result + exp(-energy)
    215         else:
    216             # loop over points

KeyboardInterrupt: 

In [12]:
# Within-cluster sum of squared distances (inertia) of the sklearn fit;
# compared below against the hand-rolled implementation's objective.
kmeans.inertia_


Out[12]:
80.644387968906543

In [14]:
K = 3              # number of clusters
N, D = data.shape  # N points in D dimensions

A = np.zeros((K, D))        # cluster means (K x D)
# BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin int has the same meaning here.
W = np.zeros(N, dtype=int)  # cluster index assigned to each point
X = data
m = np.zeros(K)             # points-per-cluster counts

# Round-robin initialization guarantees every cluster starts non-empty;
# the shuffle below randomizes which points land where.
for n in range(N):
    W[n] = n % K
    
def shuffle(x, n):
    """Fisher-Yates shuffle of the first n elements of x, in place.

    BUG FIX: the original loop ran i from n-2 down to 0, so x[n-1] was
    never swapped and always stayed in its original position.  The correct
    Fisher-Yates loop runs i from n-1 down to 1.
    """
    for i in range(n - 1, 0, -1):   # from n-1 down to 1
        j = np.random.randint(0, i + 1)  # uniform j with 0 <= j <= i
        x[i], x[j] = x[j], x[i]

shuffle(W, len(W))

converged = False

# Lloyd's algorithm: alternate between (1) recomputing each cluster's mean
# from the current assignment and (2) reassigning every point to its nearest
# mean, until no assignment changes.
while not converged:
    converged = True

    # --- Step 1: recompute cluster means ---
    A_old = A.copy()  # kept so an emptied cluster can retain its old center
    for k in range(K):
        for d in range(D):
            A[k, d] = 0
        m[k] = 0

    for n in range(N):
        for d in range(D):
            A[W[n], d] += X[n, d]
        m[W[n]] += 1

    for k in range(K):
        if m[k] > 0:
            for d in range(D):
                A[k, d] = A[k, d] / m[k]
        else:
            # ROBUSTNESS FIX: the original divided by m[k] unconditionally.
            # If a cluster ever became empty mid-run that produced NaN
            # centers (and min_ind = -1 assignments below); keep the
            # previous center instead.
            A[k] = A_old[k]

    # --- Step 2: assign each point to its closest mean ---
    for n in range(N):
        min_val = np.inf
        min_ind = -1

        for k in range(K):
            temp = 0
            for d in range(D):
                temp += (X[n, d] - A[k, d]) ** 2

            if temp < min_val:
                min_val = temp
                min_ind = k

        if min_ind != W[n]:
            W[n] = min_ind
            converged = False

In [15]:
# Visualize the hand-rolled k-means result on the first two principal
# components, mirroring the sklearn plot above (centers come from A instead
# of kmeans.cluster_centers_).
pca = PCA(n_components=2, svd_solver='full')
pca.fit(data)

pcs   = pca.transform(data)
cntrs = pca.transform(A)

mycmap = matplotlib.colors.ListedColormap(greys[3:])

x, y = pcs[:, 0], pcs[:, 1]

# Calculate the point density (shades each point by local density)
xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)

plt.figure(figsize=(6.5, 6.5))
plt.scatter(x, y, c=z, s=30, edgecolor='white', cmap=mycmap)
plt.scatter(cntrs[:, 0], cntrs[:, 1], facecolor='red', edgecolor='black', s=120, alpha=0.95)
plt.xlabel('PC1')
plt.ylabel('PC2')  # BUG FIX: was plt.xlabel('PC2'), which overwrote the x-axis label


Out[15]:
<matplotlib.text.Text at 0x1113fa860>

In [16]:
# Objective (within-cluster sum of squared distances) of the hand-rolled
# fit; should match kmeans.inertia_ reported above.  The generator sums in
# the same (n outer, d inner) order as the original nested loops.
phi = sum((X[n, d] - A[W[n], d]) ** 2
          for n in range(N)
          for d in range(D))
print(phi)


80.6443879689

In [ ]: