In [1]:
%matplotlib inline

In [2]:
# In KMeans we assume the variance of each cluster is equal, leading
# to a subdivision of space that determines how the clusters are
# assigned. What about a situation where the variances are not equal
# and each point has some probabilistic association with each cluster?

In [3]:
# Hard KMeans clustering is the same as applying a Gaussian
# Mixture Model with a covariance matrix that can be factored into
# the error times the identity matrix (sigma^2 * I), the same for
# every cluster. This leads to spherical clusters.
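
In [ ]:
# A minimal sketch of that equivalence, assuming scikit-learn is
# available (KMeans, GaussianMixture and make_blobs are its APIs; the
# blob data here is made up purely for illustration): a spherical GMM
# and KMeans should agree on well-separated clusters, up to a
# permutation of the label names.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=500, centers=3, random_state=0)
km_labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
gmm_labels = GaussianMixture(n_components=3, covariance_type='spherical',
                             random_state=0).fit_predict(X)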

In [4]:
import numpy as np

In [5]:
N = 1000
in_m = 72 # average men's height in inches
in_w = 66 # average women's height in inches

In [6]:
s_m = 2
s_w = s_m

In [7]:
m = np.random.normal(in_m, s_m, N)
w = np.random.normal(in_w, s_w, N)

In [8]:
from matplotlib import pyplot as plt

In [9]:
f, ax = plt.subplots(figsize=(7,5))
ax.set_title("Histogram of Heights")
ax.hist(m, alpha=.5, label='Men')
ax.hist(w, alpha=.5, label='Women')
ax.legend()


Out[9]:
<matplotlib.legend.Legend at 0x10c9919d0>

In [10]:
# randomly assign each index to the test set (True) or training set (False)
random_sample = np.random.choice([True, False], size=m.size)
m_test = m[random_sample]
m_train = m[~random_sample]

In [11]:
w_test = w[random_sample]
w_train = w[~random_sample]
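
In [ ]:
# An equivalent split could be made with scikit-learn's train_test_split
# (assuming scikit-learn is available; the *_alt names are just for this
# illustration). The boolean-mask approach above keeps the men's and
# women's test sets the same length because both reuse the same mask.
from sklearn.model_selection import train_test_split
m_train_alt, m_test_alt = train_test_split(m, test_size=0.5, random_state=0)
w_train_alt, w_test_alt = train_test_split(w, test_size=0.5, random_state=0)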

In [12]:
from scipy import stats

In [13]:
# fit a normal distribution to each group's training data; the sample
# mean and standard deviation are the Gaussian's maximum-likelihood fit
m_pdf = stats.norm(m_train.mean(), m_train.std())
w_pdf = stats.norm(w_train.mean(), w_train.std())

In [14]:
# Classify each point based on the likelihood that it was generated
# by either distribution; whichever distribution is more likely
# gets the corresponding label assigned.
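
In [ ]:
# A sketch of the "probabilistic association" mentioned at the start:
# assuming equal priors for the two groups, Bayes' rule gives the
# posterior probability that a height x came from the men's distribution
# as m_pdf.pdf(x) / (m_pdf.pdf(x) + w_pdf.pdf(x)). The cells below use
# the hard version of this rule (assign whichever density is larger).
def posterior_men(x):
    likelihood_m = m_pdf.pdf(x)
    likelihood_w = w_pdf.pdf(x)
    return likelihood_m / (likelihood_m + likelihood_w)

posterior_men(np.array([64.0, 69.0, 74.0]))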

In [15]:
m_pdf.pdf(m[0])


Out[15]:
0.20693407851938792

In [16]:
w_pdf.pdf(w[0])


Out[16]:
0.13156319659262075

In [17]:
guesses_m = np.ones_like(m_test)
# a male test height is labelled 0 when the women's density is higher
# at that height than the men's density
guesses_m[m_pdf.pdf(m_test) < w_pdf.pdf(m_test)] = 0

In [18]:
guesses_m.mean()

In [19]:
guesses_w = np.ones_like(w_test)
# a female test height is labelled 0 when the men's density is higher
guesses_w[m_pdf.pdf(w_test) > w_pdf.pdf(w_test)] = 0
guesses_w.mean()


Out[19]:
0.94915254237288138
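
In [ ]:
# Sanity check (a sketch, not part of the original recipe): with equal
# standard deviations and equal priors the decision boundary sits
# approximately at the midpoint of the two training means, so the
# expected accuracy for either group is the probability mass on the
# correct side of that boundary (roughly 93% for means six inches
# apart with a standard deviation of 2).
boundary = (m_train.mean() + w_train.mean()) / 2
1 - m_pdf.cdf(boundary), w_pdf.cdf(boundary)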

In [20]:
# Now allow the variances to differ between the two groups; the same
# likelihood comparison can then be repeated on refitted distributions
# (see the sketch after the histogram below).

In [21]:
s_m = 1
s_w = 4

In [22]:
m = np.random.normal(in_m, s_m, N)
w = np.random.normal(in_w, s_w, N)

In [23]:
m_test = m[random_sample]
m_train = m[~random_sample]

In [24]:
w_test = w[random_sample]
w_train = w[~random_sample]

In [25]:
f, ax = plt.subplots(figsize=(7,5))
ax.set_title("Histogram of Heights")
ax.hist(m_train, alpha=.5, label='Men')
ax.hist(w_train, alpha=.5, label='Women')
ax.legend()


Out[25]:
<matplotlib.legend.Legend at 0x11066bc50>
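
In [ ]:
# A sketch of repeating the same recipe with the unequal variances:
# refit the two normal distributions on the new training data and
# compare the likelihoods again (no output shown here; the accuracies
# will vary from run to run).
m_pdf = stats.norm(m_train.mean(), m_train.std())
w_pdf = stats.norm(w_train.mean(), w_train.std())

guesses_m = np.ones_like(m_test)
guesses_m[m_pdf.pdf(m_test) < w_pdf.pdf(m_test)] = 0

guesses_w = np.ones_like(w_test)
guesses_w[m_pdf.pdf(w_test) > w_pdf.pdf(w_test)] = 0

guesses_m.mean(), guesses_w.mean()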

In [ ]: