The stick-breaking process is a constructive definition of the Dirichlet process.
$$
\begin{aligned}
&\text{for } k = 1, 2, \dots \\
&\quad V_k \sim \mathrm{Beta}(1, \alpha) \\
&\quad \pi_k = V_k \prod_{j=1}^{k-1} (1 - V_j) \\
&\quad \theta_k \sim H \\
&G = \sum_{k=1}^{\infty} \pi_k \, \delta_{\theta_k}
\end{aligned}
$$
Since $E[V_k] = 1/(1+\alpha)$ for $V_k \sim \mathrm{Beta}(1, \alpha)$, a small $\alpha$ makes each break take a large fraction of the remaining stick, so the mass concentrates on a few atoms; a large $\alpha$ spreads the mass over many small sticks. Below we draw from the stick-breaking process with truncation level $T$ and concentration parameter $\alpha$.
In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# stick-breaking process: return the stick lengths (mixture weights)
def sbp(_alpha, _T):
    # draw T stick-breaking proportions V_k ~ Beta(1, alpha)
    V = np.random.beta(1, _alpha, size=_T)
    # force the last proportion to 1 so the truncated weights sum to 1
    V[_T - 1] = 1
    # oneV = [1, 1-V_0, ..., 1-V_{T-2}], so cumprod gives prod_{j<k} (1 - V_j)
    oneV = np.ones(_T)
    oneV[1:] = 1 - V[:_T - 1]
    # pi_k = V_k * prod_{j<k} (1 - V_j)
    return V * np.cumprod(oneV)
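As a quick sanity check (an editorial sketch, not part of the original notebook): because the last beta draw is forced to 1, the truncated stick lengths always form a probability vector.
In [ ]:
# verify that the truncated weights sum to one
pi = sbp(1.0, 20)
print(pi.sum())   # 1.0 up to floating-point rounding
print(pi[:5])     # the first few stick lengths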
In [3]:
# concentration parameter alpha
alpha_1 = 10
alpha_2 = 1
alpha_3 = 0.1
# truncation level
T = 50
# draw
p1 = sbp(alpha_1, T)
p2 = sbp(alpha_2, T)
p3 = sbp(alpha_3, T)
In [4]:
# subplots: figsize = (width, height)
plt.subplots(1, 3, figsize=(18, 4))
plt.subplot(1, 3, 1)
plt.bar(range(T), p1)
plt.title('alpha = %g' % alpha_1)
plt.subplot(1, 3, 2)
plt.bar(range(T), p2)
plt.title('alpha = %g' % alpha_2)
plt.subplot(1, 3, 3)
plt.bar(range(T), p3)
plt.title('alpha = %g' % alpha_3)
Out[4]:
[Figure: bar plots of the stick lengths for alpha = 10, alpha = 1, and alpha = 0.1]
In [5]:
# normalized gamma process: return (unnormalized weights, normalized weights)
def ngp(_alpha, _T):
    # draw T i.i.d. Gamma(shape=1, scale=alpha) weights
    V = np.random.gamma(1, _alpha, size=_T)
    return (V, V / np.sum(V))
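One caveat (an editorial note, not in the original): the scale of a gamma random variable cancels under normalization, so normalizing i.i.d. $\mathrm{Gamma}(1, \alpha)$ draws gives symmetric $\mathrm{Dirichlet}(1, \dots, 1)$ weights for every $\alpha$; the concentration parameter only rescales the unnormalized weights $V$. A minimal check:
In [ ]:
# the normalized weights sum to one for any alpha
v, p = ngp(10.0, 10)
print(v.sum())   # grows with alpha and T
print(p.sum())   # 1.0 up to floating-point rounding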
In [6]:
alpha = 1
T1 = 50
T2 = 100
T3 = 200
v1,p1 = ngp(alpha, T1)
v2,p2 = ngp(alpha, T2)
v3,p3 = ngp(alpha, T3)
# subplots: figsize = (width, height)
plt.subplots(1, 3, figsize=(18, 4))
plt.subplot(1, 3, 1)
plt.bar(range(T1), v1)
plt.title('T = %d' % T1)
plt.subplot(1, 3, 2)
plt.bar(range(T2), v2)
plt.title('T = %d' % T2)
plt.subplot(1, 3, 3)
plt.bar(range(T3), v3)
plt.title('T = %d' % T3)
Out[6]:
[Figure: bar plots of the unnormalized gamma weights for T = 50, 100, and 200]
In [7]:
from scipy.stats import multivariate_normal
n = 60  # n should be a multiple of three
x = np.zeros([n, 2])
color = ['r', 'b', 'g', 'm', 'c', 'y', 'k']
n_c = n // 3  # points per cluster
First cluster:
$x \sim \text{Multivariate-Normal}([2,2], 0.5I)$
Second cluster:
$x \sim \text{Multivariate-Normal}([-2,-2], 0.5I)$
Third cluster:
$x \sim \text{Multivariate-Normal}([2,-2], 0.5I)$
We randomly generate 20 data points for each cluster.
In [8]:
# we assume three clusters
# mean and variance of original data
mu1 = [2,2]; mu2 = [-2,-2]; mu3 = [2,-2]
sigma = np.identity(2) * 0.5
# randomly generate n/3 data points per cluster
x[:n_c, :] = np.random.multivariate_normal(mu1, sigma, size=n_c)
x[n_c:n_c*2, :] = np.random.multivariate_normal(mu2, sigma, size=n_c)
x[n_c*2:, :] = np.random.multivariate_normal(mu3, sigma, size=n_c)
y = [0]*n_c + [1]*n_c + [2]*n_c  # true cluster labels
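A minimal shape check (a hypothetical addition, not in the original notebook) to confirm the generated data before plotting:
In [ ]:
# n points in 2-D, with n_c true labels per cluster
assert x.shape == (n, 2)
assert all(y.count(c) == n_c for c in (0, 1, 2))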
In [9]:
# plot the generated data in 2-D, colored by true cluster
c_mask = [color[y[i]] for i in range(n)]
plt.scatter(x[:, 0], x[:, 1], c=c_mask)
Out[9]:
[Figure: scatter plot of the three generated clusters]
In [10]:
table = np.zeros(n)  # currently assigned table number for each data point (0 = unassigned)
alpha = 0.5 #hyperparameter (concentration parameter)
#set table of first data point
table[0] = 1
Sample the table assignment of the second data point.
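The code below follows the Chinese restaurant process predictive rule: an existing table $t$ with $n_t$ occupants is chosen with probability proportional to $n_t$, and a new table with probability proportional to $\alpha$,
$$ P(\text{table } t) = \frac{n_t}{n + \alpha}, \qquad P(\text{new table}) = \frac{\alpha}{n + \alpha}, $$
where $n$ is the number of currently seated points. Each prior weight is then multiplied by the likelihood of the data point under that table's (here simplified) Gaussian.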
In [11]:
candidate = list()  # currently occupied tables
prob = list()       # probability of being assigned to each table
# sample the table number of the second data point
for _t in np.unique(table):
    if _t != 0:
        candidate.append(int(_t))
        # this is not the correct marginalized probability; see
        # http://en.wikipedia.org/wiki/Conjugate_prior for the correct posterior analysis
        mean = np.mean(x[table == _t, :], axis=0)
        prob.append(np.sum(table == _t) / (np.sum(table != 0) + alpha)
                    * multivariate_normal.pdf(x[1, :], mean, np.identity(2)))

# find the smallest unused table number
def get_new_table_no():
    u_t = set(np.unique(table))
    for i in range(1, 1000):
        if i not in u_t:
            return i

# compute the probability of being assigned to a new table
new_table = get_new_table_no()
candidate.append(new_table)
prob.append(alpha / (np.sum(table != 0) + alpha)
            * multivariate_normal.pdf(x[1, :], np.zeros(2), np.identity(2)))
# normalize the probabilities
prob = np.array(prob)
prob /= np.sum(prob)
Candidate tables and corresponding probabilities
In [12]:
print(candidate)
print(prob)
In [13]:
# draw one sample from the categorical distribution parameterized by prob
sample = np.random.multinomial(1, prob)
# the sampled one-hot vector's argmax is the chosen candidate index
table_idx = sample.argmax()
table[1] = candidate[table_idx]
# table assignment of first two data points
table
Out[13]:
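We now repeat this step for every data point as a collapsed Gibbs sampler: each sweep removes one point's table assignment (by setting it to 0), recomputes the candidate probabilities from the remaining assignments, and resamples its table.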
In [14]:
max_iter=100
number_of_clusters = list()
for s_iter in xrange(max_iter):
for idx in xrange(n):
candidate = list()
prob = list()
table[idx] = 0
for _t in np.unique(table):
if _t != 0:
candidate.append(int(_t))
# this is not correct marginalized probability, see http://en.wikipedia.org/wiki/Conjugate_prior for correct posterior analysis
mean = np.mean(x[table == _t, :], axis=0) #mean of currently assigned data
prob.append( np.sum(table == _t)/(np.sum(table != 0)+alpha) \
* multivariate_normal.pdf(x[idx,:], mean, np.identity(2)) )
# compute probability of being assigned to new table
new_table = get_new_table_no()
candidate.append(new_table)
prob.append( alpha/(np.sum(table != 0)+alpha) \
* multivariate_normal.pdf(x[idx,:], np.zeros(2), np.identity(2)) )
# normalize probability
prob = np.array(prob)
prob /= np.sum(prob)
#sample table index
sample = np.random.multinomial(1, prob)
table_idx = sample.argmax()
table[idx] = candidate[table_idx]
#for tracking purpose
number_of_clusters.append(len(np.unique(table)))
In [15]:
c_mask = [color[int(table[i]) % len(color)] for i in range(n)]
plt.scatter(x[:,0],x[:,1], c=c_mask)
Out[15]:
[Figure: scatter plot colored by inferred table assignments]
Plot the number of clusters over the iterations.
In [16]:
plt.plot(range(max_iter), number_of_clusters)
Out[16]:
[Figure: number of occupied tables per Gibbs sweep]
In [17]:
np.mean(number_of_clusters)
Out[17]:
In [18]:
np.unique(table)
Out[18]:
In [19]:
for t in np.unique(table):
    print('Table', int(t))
    print('number of data points assigned to this table:', np.sum(table == t))
    print('mean of assigned data:', np.mean(x[table == t, :], axis=0))
    print()