In [27]:
import yaml
import matplotlib.pyplot as plt
In [28]:
# Keyword search results file; it is opened from the "datain/" folder below.
filename = "librarians_20150616_1846.yml"
In [29]:
# Load the keyword search results, keyed by year.
# yaml.safe_load replaces the bare yaml.load(f): calling yaml.load without a
# Loader raises a TypeError on PyYAML >= 6, and on older versions it falls back
# to the unsafe loader, which can construct arbitrary Python objects.
# NOTE(review): safe_load rejects Python-specific tags (e.g. !!python/tuple);
# if the data file contains any, pass an explicit Loader to yaml.load instead.
with open("datain/" + filename, 'r') as f:
    results = yaml.safe_load(f)
Look at the number of books in each year that contain the keyword ("Librarian")
In [30]:
# For every year, count how many entries (one per matching book) it holds.
bookCount = {year: len(books) for year, books in results.items()}
In [32]:
# Plot the number of books per year that contain the keyword.
# - Years are sorted explicitly so the line runs left to right whatever the
#   dict's iteration order is, and the data are passed as plain lists rather
#   than dict views.
# - The figure/axes API is used instead of the bare xlim/xlabel/... names, which
#   only exist after %pylab and fail with NameError on a fresh kernel.
years = sorted(bookCount)
fig, ax = plt.subplots()
ax.plot(years, [bookCount[year] for year in years])
ax.set_xlim(1750, 1890)
ax.set_xlabel('Year')
ax.set_ylabel('Number of books in which the keyword appears')
ax.set_title('Occurrences of the word \'Librarian\' in the corpus')
fig.savefig('pix/librarian books.png')
Look at the number of occurrences per year
In [33]:
# Total keyword occurrences per year: for each book record, the length of its
# element 3 is added up (presumably the list of matches found in that book --
# TODO confirm against the code that produced the YAML file).
wordCount = {
    year: sum(len(book[3]) for book in books)
    for year, books in results.items()
}
...and plot the two on the same axes, so we can compare the number of occurrences with the number of books in which the keyword appears.
In [34]:
# Plot book counts and occurrence counts per year on the same axes.
# Each series is sorted by year so the lines do not zigzag with dict ordering,
# and the axes API replaces the bare xlim/xlabel/legend/... names that are only
# defined under %pylab.
book_years = sorted(bookCount)
word_years = sorted(wordCount)
fig, ax = plt.subplots()
ax.plot(book_years, [bookCount[year] for year in book_years])
ax.plot(word_years, [wordCount[year] for year in word_years])
ax.set_xlim(1750, 1890)
ax.set_xlabel('Year')
ax.set_ylabel('Counts')
# loc=2 places the legend in the upper-left corner.
ax.legend(['Number of Books', 'Number of Occurrences'], loc=2)
ax.set_title('Occurrences of the word \'Librarian\'')
fig.savefig('pix/librarian books and occurrences.png')
Just out of curiosity, which is growing faster? Our graph shows a fairly constant ratio of mentions to books.
In [36]:
# Scatter the yearly occurrence count against the yearly book count.
# The two values for each point are looked up by the same year, instead of
# relying on both dicts happening to iterate in the same order, and are passed
# as lists rather than dict views.
years = sorted(bookCount)
fig, ax = plt.subplots()
ax.plot(
    [bookCount[year] for year in years],
    [wordCount[year] for year in years],
    '.',
)
ax.set_xlabel('Number of books featuring the keyword')
ax.set_ylabel('Number of times the keyword appears')
Out[36]:
In [37]:
# Load the per-year normalisation data; later cells index each year's entry
# with [0] (to normalise book counts) and [2] (to normalise word counts).
# yaml.safe_load replaces the bare yaml.load(f): calling yaml.load without a
# Loader raises a TypeError on PyYAML >= 6, and on older versions it falls back
# to the unsafe loader, which can construct arbitrary Python objects.
# NOTE(review): safe_load rejects Python-specific tags (e.g. !!python/tuple);
# if the data file contains any, pass an explicit Loader to yaml.load instead.
normal_filename = 'normaliser_20150616_1844.yml'
with open('datain/' + normal_filename, 'r') as f:
    publication = yaml.safe_load(f)
Divide each year's count of books containing the keyword by the number of books published that year (taken from the normaliser file) to find the proportion of books mentioning the keyword
In [38]:
# Fraction of each year's books that mention the keyword: the keyword book
# count divided by element 0 of that year's normalisation entry.
# Years that are not greater than zero are left out.
normalised_per_book = {
    year: float(count) / publication[year][0]
    for year, count in bookCount.items()
    if year > 0
}
In [39]:
# Plot the proportion of books per year that mention the keyword.
# Years are sorted so the line is drawn left to right whatever the dict's
# iteration order is, and axis labels are added so the figure stands alone.
years = sorted(normalised_per_book)
fig, ax = plt.subplots()
ax.plot(years, [normalised_per_book[year] for year in years])
ax.set_ylim(0, 0.2)
ax.set_xlabel('Year')
ax.set_ylabel('Proportion of books mentioning the keyword')
Out[39]:
Normalise the number of occurrences by the number of words
In [40]:
# Build two per-year series from the normalisation data (element 2 of each
# year's entry, which the notebook text describes as the number of words):
#   occ_per_word  - keyword occurrences divided by that word count
#   word_per_year - that word count expressed in millions
# Years that are not greater than zero are skipped.
occ_per_word = {}
word_per_year = {}
for year, occurrences in wordCount.items():
    if not year > 0:
        continue
    words_published = publication[year][2]
    occ_per_word[year] = float(occurrences) / words_published
    word_per_year[year] = words_published / 1000000.0
In [41]:
# Plot keyword occurrences per year next to the millions of words published.
# - Each series is sorted by year so the lines do not zigzag with dict ordering.
# - The axes API replaces the bare xlim/legend/title/xlabel names that are only
#   defined under %pylab.
# - The x label is set BEFORE saving; previously it was set after savefig, so
#   the saved PNG had no x-axis label.
occurrence_years = sorted(wordCount)
published_years = sorted(word_per_year)
fig, ax = plt.subplots()
ax.plot(occurrence_years, [wordCount[year] for year in occurrence_years])
ax.plot(published_years, [word_per_year[year] for year in published_years])
ax.set_xlim(1800, 1890)
ax.set_xlabel('Year')
# loc=2 places the legend in the upper-left corner.
ax.legend(['number of occurrences', 'millions of words published'], loc=2)
ax.set_title('Occurrences of the word \'Librarian\'')
fig.savefig('pix/librarian occurrences.png')
Out[41]:
Now use Bokeh to create an interactive, zoomable version of the above figure
In [44]:
from bokeh.plotting import figure, output_file, show
In [43]:
# Interactive (zoomable) Bokeh version of the occurrences / words-published figure,
# written to pix/Librarians.html and opened with show().
# - Data are sorted by year and passed as plain lists (Bokeh does not accept
#   dict views as column data).
# - legend_label replaces the `legend=` keyword, which was deprecated and then
#   removed from current Bokeh releases.
# - The legend position is set through `legend.location`; `legend.orientation`
#   now only accepts "horizontal"/"vertical".
# - The unused `colors` list was dropped.
output_file("pix/Librarians.html", title="Number of occurrences of the word 'Librarians'")
p = figure(title="Number of occurrences of the word 'Librarians'", x_axis_label='Year')
occurrence_years = sorted(wordCount)
published_years = sorted(word_per_year)
p.line(
    occurrence_years,
    [wordCount[year] for year in occurrence_years],
    legend_label='Number of Occurrences',
    color='blue',
)
p.line(
    published_years,
    [word_per_year[year] for year in published_years],
    legend_label='Millions of words published',
    color='red',
)
p.legend.location = "top_left"
show(p)
Finally, plot the number of times the keyword appears by year, normalised by the number of words in the corpus for that year:
In [45]:
# Plot keyword occurrences per published word, by year.
# Years are sorted so the line is drawn left to right whatever the dict's
# iteration order is, and the axes API replaces the bare xlim/ylim/xlabel/ylabel
# names that are only defined under %pylab.
years = sorted(occ_per_word)
fig, ax = plt.subplots()
ax.plot(years, [occ_per_word[year] for year in years])
ax.set_xlim(1750, 1890)
ax.set_ylim(0, 0.00001)
ax.set_xlabel('Year')
ax.set_ylabel('Occurrences/word')
Out[45]:
Many of these figures are saved in the output folder "pix", but some are shown just in this notebook.