Bayesian Nonparametric Models

Dirichlet process

Definition: $$ G \sim \text{DP}(\alpha, H) $$ where $\alpha > 0$ is a concentration parameter and $H$ is a base distribution. A draw $G$ is itself an (almost surely discrete) probability distribution.
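
For reference, the defining property (Ferguson, 1973): for every finite measurable partition $(A_1, \dots, A_K)$ of the sample space,

$$ (G(A_1), \dots, G(A_K)) \sim \text{Dirichlet}\big(\alpha H(A_1), \dots, \alpha H(A_K)\big) $$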

Stick-breaking process

The stick-breaking process (Sethuraman, 1994) gives a constructive definition of the Dirichlet process.

$$
\begin{aligned}
V_k &\sim \text{Beta}(1, \alpha), \quad k = 1, 2, \dots \\
\pi_k &= V_k \prod_{j=1}^{k-1} (1 - V_j) \\
\theta_k &\sim H \\
G &= \sum_{k=1}^{\infty} \pi_k \, \delta_{\theta_k}
\end{aligned}
$$
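
A quick check that the weights define a probability distribution: the stick length remaining after $K$ breaks is

$$ 1 - \sum_{k=1}^{K} \pi_k = \prod_{k=1}^{K} (1 - V_k), $$

which converges to $0$ almost surely as $K \to \infty$, so $\sum_{k=1}^{\infty} \pi_k = 1$.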

Below we draw the stick lengths of a stick-breaking process with truncation level $T$ and concentration parameter $\alpha$. Truncating means stopping after $T$ breaks and assigning all remaining mass to the last stick (the code sets $V_T = 1$).


In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#stick-breaking process: return the first _T stick lengths
def sbp(_alpha, _T):
    # V_k ~ Beta(1, alpha)
    V = np.random.beta(1, _alpha, size=_T)
    # truncation: set the last break fraction to 1 so the weights sum to one
    V[_T - 1] = 1
    # oneV[k] = 1 - V_{k-1}, so cumprod(oneV)[k] = prod_{j<k} (1 - V_j)
    oneV = np.ones(_T)
    oneV[1:] = 1 - V[:_T - 1]
    # pi_k = V_k * prod_{j<k} (1 - V_j)
    return V * np.cumprod(oneV)
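
As a quick sanity check (not part of the original notebook), the truncated weights should sum to one, and the first weight has expectation $\mathbb{E}[\pi_1] = \mathbb{E}[V_1] = 1/(1+\alpha)$:

# sanity check: truncated weights sum to one, and E[pi_1] = 1 / (1 + alpha)
p = sbp(1.0, 50)
print(np.isclose(p.sum(), 1.0))                            # True
first = np.array([sbp(1.0, 50)[0] for _ in range(10000)])
print(first.mean())                                        # approximately 0.5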

In [3]:
# concentration parameter alpha
alpha_1 = 10
alpha_2 = 1
alpha_3 = 0.1
# truncation level
T = 50

# draw 
p1 = sbp(alpha_1, T)
p2 = sbp(alpha_2, T)
p3 = sbp(alpha_3, T)

In [4]:
# subplots: figsize = (width, height) in inches
plt.subplots(1, 3, figsize=(18, 4))

plt.subplot(1, 3, 1)
plt.bar(range(T), p1)
plt.title('alpha = %g' % alpha_1)

plt.subplot(1, 3, 2)
plt.bar(range(T), p2)
plt.title('alpha = %g' % alpha_2)

plt.subplot(1, 3, 3)
plt.bar(range(T), p3)
plt.title('alpha = %g' % alpha_3)


[Figure: bar plots of the truncated stick-breaking weights for alpha = 10, 1, and 0.1]

Normalized gamma process

The Dirichlet process can also be constructed by normalizing gamma random variables: draw $V_k \sim \text{Gamma}(\alpha/T, 1)$ and set $\pi_k = V_k / \sum_j V_j$, so that $(\pi_1, \dots, \pi_T) \sim \text{Dirichlet}(\alpha/T, \dots, \alpha/T)$, a finite approximation that converges to a draw from $\text{DP}(\alpha, H)$ as $T \to \infty$.

In [5]:
#normalized gamma process (finite Dirichlet approximation)
def ngp(_alpha, _T):
    # np.random.gamma takes (shape, scale); with shape alpha/T the normalized
    # weights are Dirichlet(alpha/T, ..., alpha/T). (The original used
    # gamma(1, alpha), under which alpha cancels after normalization.)
    V = np.random.gamma(_alpha / float(_T), 1.0, size=_T)
    # return the unnormalized weights and the normalized probabilities
    return (V, V / np.sum(V))

In [6]:
alpha = 1
T1 = 50
T2 = 100
T3 = 200

v1,p1 = ngp(alpha, T1)
v2,p2 = ngp(alpha, T2)
v3,p3 = ngp(alpha, T3)

# subplots: figsize = (width, height) in inches
plt.subplots(1, 3, figsize=(18, 4))

plt.subplot(1, 3, 1)
plt.bar(range(T1), v1)
plt.title('T = %d' % T1)

plt.subplot(1, 3, 2)
plt.bar(range(T2), v2)
plt.title('T = %d' % T2)

plt.subplot(1, 3, 3)
plt.bar(range(T3), v3)
plt.title('T = %d' % T3)


[Figure: bar plots of the unnormalized gamma weights for T = 50, 100, and 200]
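
As another quick check (again, not in the original), the normalized weights should sum to one, and by symmetry each weight has expectation $1/T$:

# sanity check: normalized gamma weights sum to one, with E[pi_k] = 1 / T
v, p = ngp(1.0, 50)
print(np.isclose(p.sum(), 1.0))                              # True
first = np.array([ngp(1.0, 50)[1][0] for _ in range(10000)])
print(first.mean())                                          # approximately 1 / 50 = 0.02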

Dirichlet Process Mixture

A Dirichlet process mixture example.

The likelihood model is a two-dimensional normal distribution; test data are generated from three two-dimensional normal distributions.
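
For reference, the generative model of a DP mixture with Gaussian likelihood (the sampler below fixes each cluster's covariance to the identity):

$$ G \sim \text{DP}(\alpha, H), \qquad \theta_i \mid G \sim G, \qquad x_i \mid \theta_i \sim \mathcal{N}(\theta_i, I) $$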


In [7]:
from scipy.stats import multivariate_normal

n = 60  # n should be a multiple of three
x = np.zeros([n, 2])
color = ['r', 'b', 'g', 'm', 'c', 'y', 'k']
n_c = n // 3  # data points per cluster (integer division)

First cluster:

$x \sim \text{Multivariate-Normal}([2,2], 0.5 I)$

Second cluster:

$x \sim \text{Multivariate-Normal}([-2,-2], 0.5 I)$

Third cluster:

$x \sim \text{Multivariate-Normal}([2,-2], 0.5 I)$

We randomly generate 20 data points for each cluster.


In [8]:
# we assume three clusters
# mean and variance of original data 
mu1 = [2,2]; mu2 = [-2,-2]; mu3 = [2,-2]
sigma = np.identity(2) * 0.5

# randomly generate n/3 data points per cluster
x[:n_c, :] = np.random.multivariate_normal(mu1, sigma, size=n_c)
x[n_c:n_c*2, :] = np.random.multivariate_normal(mu2, sigma, size=n_c)
x[n_c*2:, :] = np.random.multivariate_normal(mu3, sigma, size=n_c)

y = [0]*n_c + [1]*n_c + [2]*n_c

In [9]:
# plot the generated data in 2-D space
c_mask = [color[y[i]] for i in range(n)]
plt.scatter(x[:,0],x[:,1], c=c_mask)


[Figure: scatter plot of the generated data, colored by true cluster label]

Sampling Code

We use collapsed Gibbs sampling based on the Chinese restaurant process (CRP) representation: each point is reassigned in turn, joining an existing table with probability proportional to the number of points already seated there times the likelihood of the point under that table, or a new table with probability proportional to $\alpha$ times the likelihood under the base distribution.
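
Concretely, writing $n_t^{-i}$ for the number of points currently at table $t$ excluding point $i$:

$$ p(z_i = t \mid z_{-i}, x) \propto \frac{n_t^{-i}}{n - 1 + \alpha} \, f(x_i \mid \text{table } t), \qquad p(z_i = t_{\text{new}} \mid z_{-i}, x) \propto \frac{\alpha}{n - 1 + \alpha} \, f(x_i \mid H) $$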


In [10]:
table = np.zeros(n)    # currently assigned table number for each data point (0 = unassigned)

alpha = 0.5  # hyperparameter (concentration parameter)

# seat the first data point at table 1
table[0] = 1

Sample the table assignment of the second data point:


In [11]:
candidate = list()  # currently allocated table list
prob = list()       # probability of being assigned to each table


# sample the table assignment of the second data point
for _t in np.unique(table):
    if _t != 0:
        candidate.append(int(_t))
        # note: this is not the exact marginalized predictive probability; we plug in
        # the empirical mean with a fixed identity covariance instead of integrating
        # over the cluster parameters. See http://en.wikipedia.org/wiki/Conjugate_prior
        # for the exact posterior analysis.
        mean = np.mean(x[table == _t, :], axis=0)
        prob.append( np.sum(table == _t)/(np.sum(table != 0)+alpha) \
                    * multivariate_normal.pdf(x[1,:], mean, np.identity(2)) )

# return the smallest table number not currently in use
def get_new_table_no():
    u_t = set(np.unique(table))
    for i in range(1, 1000):  # 1000 is an arbitrary upper bound
        if i not in u_t:
            return i

# compute probability of being assigned to a new table
new_table = get_new_table_no()
candidate.append(new_table)
prob.append( alpha/(np.sum(table != 0)+alpha) \
            * multivariate_normal.pdf(x[1,:], np.zeros(2), np.identity(2)) )

# normalize probabilities
prob = np.array(prob)
prob /= np.sum(prob)

Candidate tables and corresponding probabilities


In [12]:
print(candidate)
print(prob)


[1, 2]
[ 0.99794869  0.00205131]

In [13]:
# draw one sample from the multinomial (categorical) distribution parameterized by prob
sample = np.random.multinomial(1, prob)
# new table number for the second data point
table_idx = sample.argmax()
table[1] = candidate[table_idx]

# table assignments of the first two data points
table


Out[13]:
array([ 1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [14]:
max_iter = 100
number_of_clusters = list()

for s_iter in range(max_iter):
    for idx in range(n):
        candidate = list()
        prob = list()

        # remove the current data point from its table
        table[idx] = 0

        for _t in np.unique(table):
            if _t != 0:
                candidate.append(int(_t))
                # as above, not the exact marginalized predictive; see
                # http://en.wikipedia.org/wiki/Conjugate_prior for the exact posterior analysis
                mean = np.mean(x[table == _t, :], axis=0)  # mean of currently assigned data
                prob.append( np.sum(table == _t)/(np.sum(table != 0)+alpha) \
                            * multivariate_normal.pdf(x[idx,:], mean, np.identity(2)) )

        # compute probability of being assigned to a new table
        new_table = get_new_table_no()
        candidate.append(new_table)
        prob.append( alpha/(np.sum(table != 0)+alpha) \
                    * multivariate_normal.pdf(x[idx,:], np.zeros(2), np.identity(2)) )

        # normalize probabilities
        prob = np.array(prob)
        prob /= np.sum(prob)

        # sample a table assignment
        sample = np.random.multinomial(1, prob)
        table_idx = sample.argmax()
        table[idx] = candidate[table_idx]

    # track the number of clusters at each iteration
    number_of_clusters.append(len(np.unique(table)))
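
A common refinement (not done in this notebook) is to discard an initial burn-in before summarizing the trace; a minimal sketch, treating the first 20 iterations as burn-in:

# average number of clusters after discarding burn-in iterations
burn_in = 20
print(np.mean(number_of_clusters[burn_in:]))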

Plot Results

Plot the cluster assignments from the final sample.


In [15]:
c_mask = [color[int(table[i]) % len(color)] for i in range(n)]
plt.scatter(x[:,0],x[:,1], c=c_mask)


[Figure: scatter plot of the data, colored by sampled table assignment]

Plot the number of clusters across iterations.


In [16]:
plt.plot(range(max_iter), number_of_clusters)


[Figure: number of clusters at each Gibbs iteration]

In [17]:
np.mean(number_of_clusters)


Out[17]:
3.2200000000000002
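
The sampler averages about 3.2 clusters, matching the three true clusters. For comparison (not part of the original notebook), the prior expected number of tables under a CRP with concentration $\alpha$ and $n$ customers is $\sum_{i=1}^{n} \alpha/(\alpha + i - 1)$:

# prior expected number of tables under CRP(alpha) with n customers
def crp_expected_tables(alpha, n):
    return sum(alpha / (alpha + i) for i in range(n))

print(crp_expected_tables(0.5, 60))  # approximately 3.0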

In [18]:
np.unique(table)


Out[18]:
array([ 1.,  2.,  3.])

In [19]:
for t in np.unique(table):
    print('Table', int(t))
    print('number of data points assigned to this table', np.sum(table == t))
    print(np.mean(x[table == t, :], axis=0))
    print()


Table 1
number of data points assigned to this table 20
[ 1.99834569  1.75685838]

Table 2
number of data points assigned to this table 20
[-1.99439971 -1.80973143]

Table 3
number of data points assigned to this table 20
[ 1.89358224 -2.15001022]

