In [1]:
# Third-party libraries used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reload edited modules automatically before each cell is executed, so that
# changes to local modules (e.g. sim.py) are picked up by this kernel
# without a restart.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Display settings: 2 digits for plain float output, 3 for pandas objects.
%precision 2
pd.set_option('display.precision', 3)

# Project-local modules.  `sim` provides the experiment driver used below and
# `ztnbinom` is the distribution object the cue-set sizes are drawn from
# (presumably a zero-truncated negative binomial -- confirm in zt.py).
# NOTE(review): `ndl` is imported but not referenced in the cells shown here.
import ndl,sim
from zt import ztnbinom

Set up the compute cluster and initialize its environment. (Make sure every engine has the right versions of the files, especially sim.py!)


In [14]:
# NOTE(review): IPython.parallel is the pre-4.0 home of this API; later
# IPython releases ship it as the separate `ipyparallel` package.
from IPython.parallel import Client

# Connect to the cluster that was started under the IPython profile 'home'.
rc = Client(profile='home')
# Direct view: addresses the engines explicitly.  block=True makes calls wait
# for their results instead of returning asynchronous handles.
dview = rc.direct_view()
dview.block = True
# Load-balanced view: tasks go to whichever engine is free.  This is the view
# handed to sim.experiment() in the cells below.
lview = rc.load_balanced_view()
lview.block = True
# Show the ids of the connected engines as a quick sanity check.
rc.ids


Out[14]:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [15]:
%%px

import sys
sys.path = ['/home1/malouf/learning'] + sys.path
import sim
#from sim import Simulation

Create data that's distributed like the corpus counts in Ramscar et al.'s PNAS paper (see the Input Modeling notebook for details).


In [4]:
def cues(N):
    """Draw one random cue set together with its outcome code.

    A set size is sampled with ``ztnbinom.rvs(3, .6)``.  The cue list holds
    the integers ``0 .. size-1`` followed by the label ``'exactly<size>'``.

    Parameters
    ----------
    N : ignored
        Not used by the body; kept so existing call sites keep working.

    Returns
    -------
    list
        ``[feats, codeFunc(size)]`` -- the cue list and whatever ``codeFunc``
        returns for the sampled size.

    Notes
    -----
    * ``codeFunc`` is not defined in the cells shown in this notebook; it must
      already exist in the kernel namespace when this function is called.
    * NOTE(review): the numeric cues here start at 0, whereas the ``Cues``
      column built for the frequency table starts at 1 -- confirm which
      numbering is intended.
    """
    set_size = ztnbinom.rvs(3, .6)
    # list(...) so the concatenation also works where range() is lazy
    # (Python 3); on Python 2 the result is unchanged.
    feats = list(range(set_size)) + ['exactly%d' % set_size]
    return [feats, codeFunc(set_size)]

In [5]:
# Draw 10,000 cue-set sizes from ztnbinom.rvs(3, .6).
# range() instead of xrange() so the cell runs on Python 2 and Python 3.
# NOTE(review): no random seed is set, so the counts change on every run.
ns = [ztnbinom.rvs(3, .6) for _ in range(10000)]
# Tally the draws: data[k-1] is the number of draws equal to k
# (assumes every draw is >= 1, so size 1 lands at index 0).
data = np.zeros(max(ns))
for n in ns:
    data[n - 1] += 1
data


Out[5]:
array([  3.31e+03,   2.57e+03,   1.77e+03,   1.08e+03,   6.19e+02,
         3.13e+02,   1.65e+02,   7.60e+01,   5.30e+01,   2.00e+01,
         7.00e+00,   3.00e+00,   2.00e+00,   1.00e+00,   1.00e+00])

In [18]:
# Turn the raw count array into a table indexed by set size (1..len(data)).
# NOTE(review): this rebinds `data` from an ndarray to a DataFrame.
data = pd.DataFrame(data, columns=['Frequency'], index=range(1, len(data) + 1))
# Cues for size k: the integers 1..k plus an 'exactly<k>' label.
# list(...) keeps the concatenation valid where range() is lazy (Python 3).
data['Cues'] = [list(range(1, k + 1)) + ['exactly%d' % k] for k in data.index]
# Keep the set size as an ordinary column as well as in the index.
data['Number'] = data.index
data


Out[18]:
Frequency Cues Number
1 3313 [1, exactly1] 1
2 2570 [1, 2, exactly2] 2
3 1773 [1, 2, 3, exactly3] 3
4 1084 [1, 2, 3, 4, exactly4] 4
5 619 [1, 2, 3, 4, 5, exactly5] 5
6 313 [1, 2, 3, 4, 5, 6, exactly6] 6
7 165 [1, 2, 3, 4, 5, 6, 7, exactly7] 7
8 76 [1, 2, 3, 4, 5, 6, 7, 8, exactly8] 8
9 53 [1, 2, 3, 4, 5, 6, 7, 8, 9, exactly9] 9
10 20 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, exactly10] 10
11 7 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, exactly11] 11
12 3 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, exactl... 12
13 2 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ex... 13
14 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... 14
15 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... 15

15 rows × 3 columns


In [19]:
%%time
# Run the simulation on the frequency table, handing sim.experiment the
# load-balanced cluster view to work with.
# NOTE(review): the meaning of P=200 is defined in sim.py -- presumably the
# number of simulated runs/participants; confirm there.
r = sim.experiment(data, P=200, view=lview)


CPU times: user 43.2 s, sys: 12.6 s, total: 55.8 s
Wall time: 3min 56s

In [20]:
# Report the results of the run without a background cue; what is shown is
# determined by sim.all_results.
sim.all_results(r)


Now add a background cue to every row's cue list (basically an intercept).


In [8]:
# Prepend a constant 'background' cue to every row's cue list.
# * The loop variable is deliberately not called `cues`: on Python 2 a list
#   comprehension leaks its variable and would overwrite the cues() function
#   defined earlier in the notebook.
# * Rows that already start with 'background' are left alone, so re-running
#   this cell does not add the cue a second time.
data['Cues'] = [
    cue_list if cue_list[:1] == ['background'] else ['background'] + cue_list
    for cue_list in data['Cues']
]
data


Out[8]:
Frequency Cues Number Outcomes
1 3313 [background, 1, exactly1] 1 notdu
2 2570 [background, 1, 2, exactly2] 2 du
3 1773 [background, 1, 2, 3, exactly3] 3 notdu
4 1084 [background, 1, 2, 3, 4, exactly4] 4 notdu
5 619 [background, 1, 2, 3, 4, 5, exactly5] 5 notdu
6 313 [background, 1, 2, 3, 4, 5, 6, exactly6] 6 notdu
7 165 [background, 1, 2, 3, 4, 5, 6, 7, exactly7] 7 notdu
8 76 [background, 1, 2, 3, 4, 5, 6, 7, 8, exactly8] 8 notdu
9 53 [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, exactly9] 9 notdu
10 20 [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ex... 10 notdu
11 7 [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11... 11 notdu
12 3 [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11... 12 notdu
13 2 [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11... 13 notdu
14 1 [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11... 14 notdu
15 1 [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11... 15 notdu

15 rows × 4 columns


In [9]:
%%time
# Same experiment call as before, now on the table whose cue lists include
# 'background'.  The result is kept in r2 so the earlier run (r) stays
# available for comparison.
r2 = sim.experiment(data, P=200, view=lview)
sim.all_results(r2)


CPU times: user 47.1 s, sys: 11.5 s, total: 58.7 s
Wall time: 4min 12s


In [16]:
# Second data set: built the same way as `data`, but drawn with
# ztnbinom.rvs(3, .45) instead of (3, .6).  Its cue lists do not include the
# 'background' entry.
# range()/list(range()) replace the Python-2-only xrange()/range()+list forms
# so the cell runs on Python 2 and Python 3.
# NOTE(review): no random seed is set, so these counts differ from run to run.
ns = [ztnbinom.rvs(3, .45) for _ in range(10000)]
# data2[k-1] is the number of draws equal to k (assumes every draw is >= 1).
data2 = np.zeros(max(ns))
for n in ns:
    data2[n - 1] += 1
data2 = pd.DataFrame(data2, columns=['Frequency'], index=range(1, len(data2) + 1))
# Cues for size k: the integers 1..k plus an 'exactly<k>' label.
data2['Cues'] = [list(range(1, k + 1)) + ['exactly%d' % k] for k in data2.index]
data2['Number'] = data2.index
data2


Out[16]:
Frequency Cues Number
1 1653 [1, exactly1] 1
2 1877 [1, 2, exactly2] 2
3 1637 [1, 2, 3, exactly3] 3
4 1363 [1, 2, 3, 4, exactly4] 4
5 1035 [1, 2, 3, 4, 5, exactly5] 5
6 793 [1, 2, 3, 4, 5, 6, exactly6] 6
7 559 [1, 2, 3, 4, 5, 6, 7, exactly7] 7
8 361 [1, 2, 3, 4, 5, 6, 7, 8, exactly8] 8
9 265 [1, 2, 3, 4, 5, 6, 7, 8, 9, exactly9] 9
10 149 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, exactly10] 10
11 108 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, exactly11] 11
12 79 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, exactl... 12
13 47 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ex... 13
14 36 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... 14
15 14 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... 15
16 10 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... 16
17 5 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... 17
18 5 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... 18
19 0 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... 19
20 3 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... 20
21 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... 21

21 rows × 3 columns


In [17]:
%%time
r3 = sim.experiment(data, P=200, view=lview)
sim.all_results(r3)


CPU times: user 43 s, sys: 12.6 s, total: 55.5 s
Wall time: 3min 59s

In [ ]: