In [1]:
    
import yaml
import matplotlib.pyplot as plt
#disease = 'consumption'
# Path of the YAML results file that the loading cell opens.
# The previous first assignment ('find_figures_20150616_1110_6.yml') was
# overwritten on the very next line and never took effect; it is kept here only
# as a note of the alternative input.
filename = 'datain/find_figures_20150616_1110.yml'
    
In [2]:
    
# Parse the results file named by `filename` into `results`.
# safe_load replaces the bare yaml.load(f): PyYAML >= 6 raises a TypeError when
# no Loader is passed, and the loader older releases used by default can
# construct arbitrary Python objects from tags in the file.
# NOTE(review): if the file contains Python-specific tags (e.g. !!python/tuple)
# safe_load will reject it -- use yaml.load(f, Loader=yaml.FullLoader) instead.
with open(filename, 'r') as f:
    results = yaml.safe_load(f)
    
In [3]:
    
# Show the top-level keys of the parsed results mapping
# (presumably publication years -- later cells index by values such as 1850).
results.keys()
    
    Out[3]:
In [4]:
    
import pandas as pd
    
In [67]:
    
# Debugging walk over every page record stored at index 3 of each results entry.
# The print statements are commented out; `pass` keeps the inner loop
# syntactically valid (a loop body consisting only of a comment is a syntax error).
for key, value in results.items():
    #print key, value[3]
    for page in value[3]:
        #print key, page[3]
        pass
    
    
In [6]:
    
# Flatten the results into two parallel lists (one entry per page) and, in
# parallel, group the same values by key in `tobin`.
# Element 3 of each entry is iterated as the page list; element 3 of each page
# is the value collected (named "percentages" here).
dates = []
percentages = []
tobin = {}
for key, value in results.items():
    for page in value[3]:
        dates.append(key)
        percentages.append(page[3])
        # setdefault creates the per-key list on first sight, then appends.
        tobin.setdefault(key, []).append(page[3])
    
In [7]:
    
# Scatter every collected value against its key and save the figure to disk.
# Axis labels match the ones used by the bokeh version of this plot so the
# saved image can be read on its own.
plt.plot(dates, percentages, '.')
plt.xlabel('Year')
plt.ylabel('%')
plt.savefig("sizes.jpg")
    
    
In [8]:
    
from bokeh.plotting import figure, output_file, show
    
In [9]:
    
# Interactive scatter of all (dates, percentages) points, written to a
# standalone HTML page and opened with show().
output_file("figures1.html", title="All points")
p = figure(
    title=" references",
    x_axis_label='Year',
    y_axis_label='%',
)
p.scatter(dates, percentages)
show(p)
    
In [47]:
    
# Histogram each year's values over fixed bin edges 10..90.
# plt.hist returns (counts, bin_edges, patches); later cells read elements
# [0] and [1] of the stored tuple.
# The loop variable is deliberately not named `percentages`: that name holds
# the flat list plotted by the scatter cells, and reusing it here silently
# replaced that list with the last year's values.
BIN_EDGES = [10, 20, 30, 40, 50, 60, 70, 80, 90]
histograms = {}
for year, year_percentages in tobin.items():
    histograms[year] = plt.hist(year_percentages, BIN_EDGES)
    
    
In [48]:
    
# Element 0 of the stored plt.hist result for key 1850 (the per-bin counts).
# Assumes 1850 is present in `histograms`.
histograms[1850][0]
    
    Out[48]:
In [49]:
    
# Show the number of bin edges for one year, and the number left once the
# first edge is dropped (the slice the plotting cells pair with the counts).
# Both values are displayed as a tuple; previously the first expression's
# result was discarded, and the second relied on a `year` variable left over
# from an earlier loop, making the cell depend on execution order.
edges_1606 = histograms[1606][1]
len(edges_1606), len(edges_1606[1:])
    
    Out[49]:
In [50]:
    
# Draw one line per year: counts against the bin edges with the first edge
# dropped, so both sequences have the same length.
# Each line is labelled with its year so the legend has entries, and the legend
# is requested through `plt` -- a bare legend() is only defined under %pylab.
for year in histograms:
    counts, edges = histograms[year][0], histograms[year][1]
    plt.plot(edges[1:], counts, label=str(year))
plt.legend()
    
    
In [52]:
    
# Bokeh version of the per-year histogram lines, written to its own HTML page.
output_file("figures_overtime.html", title="Place holder")
p = figure(
    title=" references",
    x_axis_label='%',
    y_axis_label='Occurrences',
)
for year in histograms:
    # First two elements of the stored plt.hist tuple: counts and bin edges.
    counts, edges = histograms[year][:2]
    # Drop the first edge so x and y have equal length.
    p.line(edges[1:], counts)
show(p)
    
In [53]:
    
# Parse the normaliser file from the datain/ directory into `publication`.
# safe_load replaces the bare yaml.load(f): PyYAML >= 6 raises a TypeError when
# no Loader is passed, and the loader older releases used by default can
# construct arbitrary Python objects from tags in the file.
# NOTE(review): if the file contains Python-specific tags (e.g. !!python/tuple)
# safe_load will reject it -- use yaml.load(f, Loader=yaml.FullLoader) instead.
normal_filename = 'normaliser_20150616_1844.yml'
with open('datain/' + normal_filename, 'r') as f:
    publication = yaml.safe_load(f)
    
Normalise by number of pages?
In [54]:
    
# Divide every bin count of a year by element 1 of that year's `publication`
# entry (presumably the page count -- TODO confirm against the normaliser file).
# Years present in `publication` but absent from `histograms` are skipped.
# (A word-based variant, histos_normed_word, was sketched here but never written.)
histos_normed_page = {}
for year in publication:
    if year not in histograms:
        continue
    histos_normed_page[year] = [
        count / publication[year][1] for count in histograms[year][0]
    ]
    
In [66]:
    
# Plot the normalised counts of every year against the bin edges (first edge
# dropped so lengths match), zoomed to x in [30, 80] and y in [0, 3e-4].
# Each line carries a label so that plt.legend() actually has entries to show;
# without labels the legend is empty.
# `data` is not used in this cell; it is kept in case later cells rely on it.
data = {}
for year in histos_normed_page:
    upper_edges = histograms[year][1][1:]
    plt.plot(upper_edges, histos_normed_page[year], '.-', label=str(year))
plt.legend()
plt.xlim(30, 80)
plt.ylim(0, 3e-4)
    
    Out[66]: