In [1]:
import numpy as np

In [2]:
cpt = np.random.rand(3,2)
# normalize
cpt[0,0] = 1
cpt[2,0] = 2
const = np.sum(cpt)
cpt = cpt/const
print cpt
x1 = [1,2]
x2 = [1,2,3]


[[ 0.17746699  0.11989266]
 [ 0.1176311   0.13476761]
 [ 0.35493399  0.09530765]]

In [4]:
tmpCpt


Out[4]:
array([ 0.65003208,  0.34996792])

In [3]:
N = 1000
data = np.zeros((2, N))
# apply gibbs sampling to generate the data
x0 = [0,1]
tmpCpt = np.sum(cpt, 0)
for i in xrange(N):
    # sample one from it
    data[0,i] = int(np.random.choice(x1, 1, p=tmpCpt.tolist()))
    # print tmpCpt.tolist()
    tmpCpt2 = cpt[:,data[0,i]-1].tolist()
    data[1,i] = int(np.random.choice(x2, 1,p=tmpCpt2 / np.sum(tmpCpt2)))
    # print cpt[:,data[0,i]-1].tolist()
p = 0.3 # percent of unknown data
mask = np.random.rand(2, N)

data[mask < p] = 0
print data


[[ 0.  1.  2. ...,  1.  1.  1.]
 [ 2.  3.  2. ...,  2.  3.  3.]]
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:10: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

In [5]:
sum(data[0,:] == 1)


Out[5]:
462

In [6]:
with open ("data.txt","w")as fp:
    for line in data:
        print line
        for i in xrange(len(line)):
            if i != len(line) - 1:
                fp.write(str(int(line[i])) + ", ")
            else:
                fp.write(str(int(line[i])))
        fp.write("\n")


[ 0.  1.  2.  1.  1.  1.  0.  0.  1.  2.  0.  0.  0.  1.  2.  0.  1.  2.
  1.  0.  2.  2.  0.  2.  1.  2.  1.  0.  1.  2.  1.  0.  1.  0.  1.  1.
  2.  2.  2.  1.  1.  2.  0.  1.  0.  1.  1.  0.  0.  0.  0.  0.  0.  0.
  1.  1.  1.  0.  1.  1.  1.  0.  2.  1.  1.  1.  1.  0.  1.  0.  0.  1.
  0.  0.  2.  0.  2.  0.  1.  1.  1.  2.  0.  1.  0.  0.  1.  1.  1.  1.
  2.  2.  0.  2.  1.  1.  1.  0.  1.  1.  0.  0.  1.  1.  0.  1.  0.  0.
  2.  1.  2.  0.  0.  1.  0.  2.  1.  1.  0.  0.  1.  1.  1.  2.  1.  0.
  1.  2.  0.  2.  1.  1.  2.  1.  1.  1.  2.  1.  1.  0.  0.  1.  2.  1.
  1.  1.  1.  1.  1.  2.  1.  2.  1.  2.  0.  1.  1.  1.  1.  0.  1.  1.
  2.  1.  1.  1.  1.  1.  0.  1.  0.  2.  2.  1.  0.  1.  1.  0.  2.  2.
  2.  1.  0.  2.  0.  1.  2.  0.  1.  2.  2.  1.  0.  0.  1.  2.  0.  0.
  1.  0.  1.  2.  1.  1.  1.  1.  0.  0.  1.  0.  1.  1.  2.  1.  2.  1.
  0.  0.  1.  1.  1.  1.  2.  0.  1.  1.  2.  1.  0.  1.  0.  1.  0.  0.
  2.  1.  0.  0.  2.  1.  1.  1.  2.  1.  0.  2.  1.  1.  1.  2.  1.  1.
  1.  1.  1.  1.  0.  1.  0.  2.  1.  2.  1.  0.  0.  0.  2.  1.  2.  1.
  1.  2.  0.  0.  0.  2.  1.  0.  2.  0.  0.  0.  0.  1.  1.  0.  1.  2.
  0.  0.  2.  2.  1.  1.  1.  0.  1.  1.  2.  2.  1.  0.  1.  1.  2.  2.
  1.  1.  1.  2.  1.  0.  2.  1.  1.  1.  1.  0.  0.  1.  0.  1.  1.  2.
  1.  0.  1.  1.  2.  2.  2.  2.  0.  1.  0.  2.  1.  1.  2.  1.  0.  1.
  2.  0.  0.  1.  1.  2.  1.  0.  1.  1.  0.  1.  1.  1.  0.  1.  2.  1.
  2.  1.  2.  2.  0.  0.  0.  1.  0.  1.  1.  1.  2.  1.  0.  1.  1.  2.
  1.  2.  1.  1.  2.  0.  1.  2.  0.  0.  0.  2.  1.  0.  1.  1.  2.  0.
  1.  0.  2.  0.  2.  2.  1.  2.  0.  1.  1.  1.  0.  2.  0.  2.  1.  0.
  1.  0.  2.  1.  0.  1.  1.  2.  2.  0.  0.  0.  1.  0.  1.  2.  0.  1.
  1.  0.  0.  1.  1.  1.  0.  1.  2.  2.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  0.  1.  2.  0.  2.  1.  2.  0.  2.  1.  1.  2.  1.  2.  2.  2.  0.
  1.  2.  2.  1.  0.  2.  0.  1.  2.  1.  0.  0.  1.  0.  0.  0.  0.  1.
  1.  2.  0.  1.  2.  0.  1.  1.  2.  1.  0.  1.  1.  1.  2.  2.  2.  2.
  1.  0.  0.  0.  0.  1.  2.  1.  2.  2.  1.  0.  2.  2.  2.  0.  1.  2.
  0.  1.  2.  0.  1.  2.  2.  0.  0.  1.  2.  0.  0.  1.  1.  0.  2.  1.
  2.  2.  0.  1.  2.  2.  1.  1.  0.  2.  0.  1.  1.  1.  1.  1.  2.  1.
  1.  0.  1.  0.  2.  0.  2.  1.  2.  0.  0.  1.  0.  2.  2.  1.  2.  1.
  0.  0.  0.  2.  1.  0.  0.  1.  2.  0.  0.  2.  1.  2.  2.  1.  0.  1.
  1.  1.  2.  2.  0.  2.  2.  2.  1.  0.  1.  2.  1.  0.  1.  2.  0.  0.
  1.  1.  1.  1.  1.  0.  1.  2.  2.  0.  1.  2.  1.  1.  1.  0.  0.  0.
  0.  1.  0.  1.  1.  0.  1.  1.  0.  2.  2.  1.  1.  1.  1.  0.  1.  2.
  1.  2.  2.  0.  1.  0.  1.  1.  2.  2.  0.  2.  0.  1.  0.  1.  2.  0.
  1.  1.  2.  2.  0.  1.  2.  2.  1.  0.  1.  1.  0.  2.  1.  1.  1.  0.
  1.  1.  0.  1.  1.  1.  2.  2.  1.  1.  2.  1.  2.  2.  2.  1.  1.  1.
  2.  1.  1.  1.  2.  2.  1.  1.  0.  1.  1.  1.  1.  0.  1.  1.  2.  1.
  1.  1.  2.  0.  2.  0.  1.  2.  0.  1.  1.  2.  1.  1.  1.  1.  0.  2.
  1.  0.  0.  1.  1.  0.  0.  1.  1.  2.  2.  2.  1.  1.  2.  1.  2.  1.
  1.  1.  2.  0.  0.  0.  2.  2.  1.  1.  1.  1.  1.  2.  0.  2.  0.  1.
  0.  1.  1.  0.  1.  1.  1.  0.  1.  0.  0.  0.  2.  1.  2.  1.  1.  2.
  1.  1.  0.  1.  0.  1.  0.  0.  0.  1.  2.  2.  0.  1.  2.  0.  1.  0.
  1.  2.  2.  1.  2.  1.  0.  0.  2.  1.  2.  0.  1.  2.  1.  1.  2.  0.
  1.  1.  2.  0.  0.  1.  1.  2.  0.  1.  0.  2.  0.  0.  1.  0.  2.  2.
  0.  1.  0.  0.  1.  1.  2.  2.  0.  0.  0.  2.  1.  1.  1.  1.  1.  1.
  2.  0.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  0.  0.  1.  1.  0.  1.
  1.  0.  2.  1.  2.  2.  2.  0.  1.  1.  1.  1.  2.  1.  1.  1.  1.  1.
  0.  1.  1.  1.  1.  0.  2.  1.  2.  2.  1.  1.  1.  0.  0.  0.  1.  1.
  1.  1.  0.  2.  2.  1.  0.  2.  2.  0.  1.  2.  2.  2.  1.  0.  2.  1.
  1.  0.  1.  1.  2.  2.  0.  0.  1.  1.  2.  2.  0.  1.  1.  1.  1.  0.
  1.  1.  0.  1.  2.  0.  0.  2.  1.  0.  0.  2.  2.  0.  2.  0.  1.  0.
  2.  2.  1.  1.  0.  1.  1.  0.  0.  2.  1.  1.  2.  0.  1.  2.  1.  1.
  0.  1.  1.  0.  1.  2.  2.  1.  1.  1.]
[ 2.  3.  2.  1.  1.  0.  3.  2.  0.  1.  3.  3.  3.  3.  1.  2.  0.  3.
  3.  1.  3.  1.  2.  1.  3.  1.  1.  3.  0.  2.  0.  1.  0.  0.  2.  2.
  1.  3.  0.  2.  3.  1.  1.  1.  3.  0.  1.  2.  3.  2.  3.  1.  1.  3.
  0.  0.  3.  0.  3.  2.  0.  3.  2.  0.  3.  3.  1.  1.  2.  0.  0.  1.
  0.  0.  0.  3.  1.  1.  0.  1.  0.  0.  3.  0.  0.  1.  1.  2.  3.  3.
  3.  2.  3.  0.  1.  3.  2.  0.  2.  0.  0.  1.  0.  0.  0.  3.  3.  2.
  2.  3.  1.  1.  0.  3.  0.  3.  3.  3.  1.  3.  3.  2.  1.  0.  1.  3.
  1.  1.  0.  3.  1.  1.  1.  3.  1.  0.  0.  1.  1.  0.  0.  0.  0.  2.
  3.  3.  3.  1.  3.  1.  0.  3.  0.  2.  1.  3.  3.  2.  0.  0.  3.  3.
  3.  0.  1.  0.  3.  2.  0.  1.  2.  2.  1.  2.  3.  1.  1.  3.  2.  1.
  1.  3.  2.  3.  3.  1.  0.  3.  0.  3.  2.  3.  2.  2.  3.  2.  0.  3.
  3.  1.  1.  0.  2.  3.  3.  3.  2.  3.  2.  2.  3.  3.  0.  3.  1.  0.
  0.  3.  0.  3.  2.  3.  2.  1.  3.  3.  2.  3.  3.  3.  3.  2.  3.  0.
  1.  1.  2.  3.  1.  3.  3.  3.  2.  2.  3.  2.  3.  0.  2.  3.  3.  2.
  0.  0.  3.  1.  1.  1.  1.  1.  3.  3.  3.  1.  0.  3.  1.  0.  1.  0.
  2.  0.  2.  3.  3.  0.  1.  0.  2.  0.  3.  0.  0.  0.  3.  2.  1.  0.
  0.  0.  0.  0.  2.  0.  1.  1.  3.  3.  1.  0.  0.  1.  0.  3.  0.  2.
  1.  0.  0.  3.  3.  1.  2.  2.  3.  3.  1.  1.  3.  0.  1.  0.  3.  2.
  3.  2.  3.  0.  0.  0.  1.  1.  3.  0.  3.  2.  1.  1.  1.  1.  3.  1.
  1.  3.  2.  1.  1.  0.  3.  2.  3.  0.  3.  1.  3.  3.  2.  3.  1.  0.
  0.  2.  3.  0.  3.  1.  3.  2.  2.  1.  0.  3.  1.  2.  0.  0.  1.  0.
  3.  0.  0.  0.  1.  0.  0.  2.  0.  2.  3.  2.  3.  1.  3.  3.  1.  0.
  2.  2.  0.  0.  3.  0.  1.  2.  1.  0.  1.  3.  3.  3.  2.  3.  1.  0.
  3.  2.  0.  1.  1.  0.  0.  2.  0.  2.  3.  1.  3.  3.  0.  1.  3.  0.
  3.  1.  3.  3.  0.  2.  2.  1.  1.  1.  0.  3.  3.  3.  0.  3.  3.  0.
  0.  1.  3.  0.  0.  3.  3.  0.  2.  1.  0.  2.  2.  0.  2.  1.  0.  1.
  1.  0.  3.  0.  0.  1.  3.  0.  0.  3.  0.  1.  3.  0.  3.  3.  1.  3.
  3.  0.  2.  1.  0.  2.  1.  0.  2.  1.  0.  3.  2.  0.  3.  1.  1.  1.
  2.  1.  3.  1.  2.  0.  3.  3.  0.  1.  1.  1.  0.  2.  0.  1.  0.  1.
  2.  1.  2.  0.  3.  2.  3.  0.  0.  0.  2.  0.  2.  0.  3.  2.  2.  3.
  1.  2.  0.  0.  3.  3.  3.  3.  0.  0.  2.  3.  3.  3.  0.  3.  2.  0.
  0.  3.  2.  1.  2.  3.  1.  3.  3.  2.  1.  3.  0.  2.  2.  1.  3.  0.
  0.  0.  2.  0.  0.  3.  2.  3.  0.  0.  2.  1.  0.  0.  0.  0.  1.  3.
  1.  0.  1.  3.  3.  1.  3.  0.  1.  2.  3.  2.  3.  0.  2.  2.  3.  0.
  2.  3.  0.  0.  3.  0.  3.  3.  3.  0.  0.  0.  2.  0.  3.  0.  3.  3.
  0.  1.  0.  0.  1.  1.  0.  2.  2.  1.  0.  1.  1.  1.  0.  1.  3.  2.
  2.  3.  3.  0.  3.  1.  3.  3.  0.  2.  1.  0.  0.  0.  2.  1.  2.  1.
  3.  0.  2.  1.  0.  0.  1.  1.  2.  0.  1.  3.  3.  1.  3.  3.  3.  3.
  3.  1.  3.  3.  3.  3.  2.  0.  0.  3.  1.  1.  3.  0.  3.  1.  2.  3.
  0.  0.  2.  1.  2.  2.  0.  3.  0.  2.  0.  3.  0.  0.  3.  0.  0.  1.
  3.  3.  3.  3.  2.  0.  0.  2.  1.  0.  3.  1.  1.  3.  0.  0.  3.  3.
  0.  2.  3.  3.  3.  0.  3.  1.  2.  0.  0.  1.  2.  3.  2.  0.  0.  0.
  3.  0.  2.  2.  1.  1.  0.  1.  0.  0.  3.  3.  1.  3.  1.  1.  3.  0.
  0.  1.  0.  3.  3.  3.  3.  0.  0.  0.  3.  2.  0.  0.  2.  3.  1.  1.
  0.  0.  1.  0.  3.  0.  0.  0.  3.  2.  3.  1.  0.  3.  2.  3.  3.  0.
  2.  0.  0.  0.  1.  2.  2.  3.  2.  2.  3.  1.  3.  0.  0.  0.  2.  1.
  1.  0.  3.  1.  3.  3.  3.  2.  1.  1.  3.  3.  0.  3.  0.  0.  3.  2.
  1.  1.  1.  2.  1.  3.  0.  1.  0.  1.  0.  1.  0.  3.  1.  0.  2.  3.
  0.  3.  2.  1.  3.  3.  1.  0.  2.  1.  0.  0.  3.  3.  1.  1.  0.  3.
  3.  3.  1.  3.  2.  3.  1.  0.  0.  1.  3.  1.  2.  1.  3.  1.  2.  0.
  3.  0.  2.  0.  3.  1.  0.  0.  0.  0.  0.  1.  3.  0.  0.  0.  1.  3.
  3.  3.  0.  2.  1.  0.  2.  0.  2.  3.  0.  1.  2.  0.  3.  3.  3.  3.
  2.  3.  1.  1.  3.  2.  1.  0.  0.  3.  3.  3.  0.  3.  3.  3.  2.  2.
  0.  0.  1.  2.  1.  0.  2.  0.  3.  3.  2.  2.  2.  2.  0.  3.  0.  0.
  2.  3.  3.  0.  2.  3.  3.  3.  2.  2.  0.  3.  3.  1.  2.  1.  3.  3.
  0.  3.  1.  0.  3.  3.  0.  2.  3.  3.]

In [7]:
tmp = np.array([ 199.48,120.06,338.06,120.88,131.51,95.860])
tmp = tmp / np.sum(tmp)

In [8]:
avgCpt = np.zeros((3, 2))
for i in xrange(N):
    avgCpt[data[1,i]-1, data[0,i]-1] += 1
avgCpt = avgCpt/np.sum(avgCpt)


/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:3: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  app.launch_new_instance()

In [9]:
print tmp
print cpt
print avgCpt
print tmpCpt


[ 0.19831983  0.11936173  0.33609385  0.12017696  0.13074514  0.09530248]
[[ 0.17746699  0.11989266]
 [ 0.1176311   0.13476761]
 [ 0.35493399  0.09530765]]
[[ 0.092  0.127]
 [ 0.056  0.12 ]
 [ 0.314  0.291]]
[ 0.65003208  0.34996792]

In [60]:
i = 1
cpt[:,data[0,i]-1].tolist()


/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:2: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  from ipykernel import kernelapp as app
Out[60]:
[0.0033381790418454296, 0.18905551089099382, 0.048171997949225116]