In [ ]:
%pylab inline
In [ ]:
from tethne.readers import zotero
import matplotlib.pyplot as plt
In [ ]:
# Location of the Zotero export to load.
# NOTE(review): this is an absolute, machine-specific path -- adjust it before
# running the notebook anywhere else.
zotero_export_path = '/Users/erickpeirson/Projects/tethne-notebooks/data/zotero'

# Read the export into a corpus object used by the cells below.
corpus = zotero.read(zotero_export_path)
In [ ]:
# Peek at the first 5 entries of the Paper index.
# list(...) keeps the slice working on Python 3, where items() returns a
# view that cannot be sliced; on Python 2 the result is unchanged.
# NOTE(review): the keys are presumably DOIs -- confirm against how the corpus
# was indexed.
list(corpus.indexed_papers.items())[0:5]
In [ ]:
# Display the corpus's structuredfeatures attribute to see which entries are
# available (the next cell looks up 'pdf_text' in it).
corpus.structuredfeatures
In [ ]:
# Bind the 'pdf_text' entry of corpus.structuredfeatures to a short name;
# the remaining cells inspect its .features attribute.
pdf_text = corpus.structuredfeatures['pdf_text']
In [ ]:
# Show the first feature stored in pdf_text.
# list(...) makes the values indexable on Python 3 (where values() is a view)
# while leaving the Python 2 result unchanged.
# NOTE(review): "first" follows the iteration order of pdf_text.features,
# which is presumably a dict -- confirm if a specific paper is intended.
list(pdf_text.features.values())[0]
In [24]:
# Overlay histograms of len(f) for every item f in selected pdf_text features.
#
# list(...) makes the values indexable on Python 3 as well as Python 2.
# NOTE(review): positions refer to the iteration order of pdf_text.features,
# so which paper each one corresponds to is not fixed by this cell.
feature_values = list(pdf_text.features.values())

# Positions 0 and 3 were previously toggled off; add them to this tuple to
# include them in the overlay again.
positions_to_plot = (1, 2)

for position in positions_to_plot:
    item_lengths = [len(f) for f in feature_values[position]]
    plt.hist(item_lengths, bins=60, alpha=0.3, label='feature #%d' % position)

# Log-scaled counts keep the sparse long tail visible next to the dense bins.
plt.yscale('log')
plt.xlabel('len(f) for each item f in the feature')
plt.ylabel('Count (log scale)')
plt.legend()
plt.show()
In [27]:
# Average len(f) over the items of the first pdf_text feature.
# `mean` is not imported explicitly; it comes from the namespace populated by
# `%pylab inline` at the top of the notebook.
# list(...) makes values() indexable on Python 3 without changing the
# Python 2 result.
mean([len(f) for f in list(pdf_text.features.values())[0]])
Out[27]:
In [26]:
# Average len(f) over the items of the pdf_text feature at position 2.
# `mean` is not imported explicitly; it comes from the namespace populated by
# `%pylab inline` at the top of the notebook.
# list(...) makes values() indexable on Python 3 without changing the
# Python 2 result.
mean([len(f) for f in list(pdf_text.features.values())[2]])
Out[26]:
In [ ]: