In [2]:
%pylab inline
In [3]:
import nltk
from tethne.readers import zotero
import matplotlib.pyplot as plt
from helpers import normalize_token, filter_token
import numpy as np
import pandas as pd
import pymc
In [4]:
text_root = '../data/SystemsBiology'
documents = nltk.corpus.PlaintextCorpusReader(text_root, '.+.txt')
In [6]:
document_index = pd.DataFrame(columns=['Year', 'File'])
for i, fileid in enumerate(documents.fileids()):
year = int(fileid[:4])
document_index.loc[i] = [year, fileid]
In [7]:
organism_data = pd.DataFrame(columns=['Year', 'Count'])
i = 0
for year, indices in document_index.groupby('Year').groups.items():
fileids = document_index.ix[indices].File
tokens = [normalize_token(token) # The tokens in this year.
for token in documents.words(fileids=fileids)
if filter_token(token)]
N = len(tokens) # Number of tokens in this year.
chunk_size = 200 # This shouldn't be too large.
for x in xrange(0, N, chunk_size):
current_chunk = tokens[x:x+chunk_size]
count = nltk.FreqDist(current_chunk)['organism']
# Store the counts for this chunk as an observation.
organism_data.loc[i] = [year, count]
i += 1 # Increment the index variable.
In [8]:
# Instead of using values in the 2000s, we'll use the
# offset from the start. So 2003 is year 0, 2004 is
# year 1, etc.
X = organism_data.Year - organism_data.Year.min()
# This will create a new rate~Gamma node for each year.
rates = pymc.Container([pymc.Gamma('rate_%i' % year, alpha=1., beta=1.)
for year in organism_data.Year.unique()])
@pymc.deterministic
def rate(r=rates):
return np.array([rates[int(x)] for x in X])
word_count = pymc.Poisson('word_count',
mu=rate,
value=organism_data.Count,
observed=True)
model = pymc.Model({
'rates': rates,
'word_count': word_count
})
In [9]:
M2 = pymc.MCMC(model)
In [10]:
M2.sample(iter=20000, burn=1000, thin=10)
In [11]:
pymc.Matplot.plot(M2)
In [1]:
plt.figure(figsize=(10, 4))
means = organism_data.groupby('Year').mean()
# Plot the means of the observed data in orange.
plt.scatter(means.index, means.Count,
color='orange',
s=50,
label='Mean (rate) from data')
# Plot the means of the samples in blue.
plt.scatter(means.index, [dist.trace()[:].mean()
for dist in M2.rates],
label='Mean (rate) from samples')
# Plot the 95% credible interval as box/whiskers.
plt.boxplot([dist.trace()[:] for dist in M2.rates],
positions=organism_data.Year.unique(),
whis=[2.5, 97.5],
showfliers=False)
plt.legend(loc='best')
plt.show()
In [ ]: