In [ ]:
# NOTE: %pylab pulls numpy and matplotlib names into the interactive namespace
# (see the "Populating the interactive namespace" output below); any bare
# numpy/matplotlib name used in later cells depends on this line having run.
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [ ]:
import matplotlib.pyplot as plt
import numpy as np

from tethne.readers import zotero

In [ ]:
# Path handed to the Zotero reader. This is a machine-specific absolute path,
# so it has to be adjusted before running the notebook anywhere else.
zotero_export_path = '/Users/erickpeirson/Projects/tethne-notebooks/data/zotero'
corpus = zotero.read(zotero_export_path)

In [ ]:
# Preview the first 5 (key, paper) entries of the Paper index.
# list() makes the slice work whether items() returns a list or a view object.
list(corpus.indexed_papers.items())[0:5]

In [ ]:
# Display the structured feature sets attached to the corpus; they are looked
# up by name (e.g. 'pdf_text').
corpus.structuredfeatures

In [ ]:
# Keep a handle on the 'pdf_text' feature set; the cells below read its
# `.features` mapping.
pdf_text = corpus.structuredfeatures['pdf_text']

In [ ]:
# Show the first feature stored in pdf_text.features.
# list() makes positional indexing work whether values() returns a list or a
# view object.
list(pdf_text.features.values())[0]

In [24]:
# Overlay histograms of len() over the elements of selected features.
# Only positions 1 and 2 are plotted; positions 0 and 3 were disabled in the
# earlier version of this cell -- add them to PLOTTED_POSITIONS to include them.
PLOTTED_POSITIONS = (1, 2)

# list() makes positional indexing work whether values() returns a list or a
# view object.
feature_list = list(pdf_text.features.values())

for position in PLOTTED_POSITIONS:
    element_lengths = [len(f) for f in feature_list[position]]
    plt.hist(element_lengths, bins=60, alpha=0.3,
             label='feature #%d' % position)

# Log-scale the counts so that sparsely populated bins remain visible.
plt.yscale('log')
plt.xlabel('len() of each element')
plt.ylabel('count (log scale)')
plt.legend()
plt.show()



In [27]:
# Mean of len() over the elements of the first feature in pdf_text.features.
# np.mean is spelled out instead of relying on the bare `mean` that %pylab
# injects; list() makes positional indexing work on view objects as well.
first_feature = list(pdf_text.features.values())[0]
np.mean([len(f) for f in first_feature])


Out[27]:
16.144044321329641

In [26]:
# Mean of len() over the elements of the third feature in pdf_text.features.
# np.mean is spelled out instead of relying on the bare `mean` that %pylab
# injects; list() makes positional indexing work on view objects as well.
third_feature = list(pdf_text.features.values())[2]
np.mean([len(f) for f in third_feature])


Out[26]:
5.1667772197479387

In [ ]: