In [1]:
import yaml
import matplotlib.pyplot as plt
#disease = 'consumption'
filename = 'find_figures_20150616_1110_6.yml'
filename = 'datain/find_figures_20150616_1110.yml'

In [2]:
with open(filename, 'r') as f:
    results = yaml.load(f)

In [3]:
results.keys()


Out[3]:
[1792,
 1793,
 1795,
 1797,
 1800,
 1801,
 1802,
 1803,
 1804,
 1805,
 1806,
 1807,
 1809,
 1810,
 1811,
 1812,
 1813,
 1815,
 1816,
 1817,
 1818,
 1819,
 1820,
 1821,
 1822,
 1823,
 1824,
 1825,
 1827,
 1828,
 1829,
 1830,
 1831,
 1833,
 1834,
 1835,
 1836,
 1837,
 1838,
 1839,
 1840,
 1841,
 1842,
 1843,
 1844,
 1845,
 1846,
 1847,
 1848,
 1849,
 1850,
 1851,
 1852,
 1853,
 1854,
 1855,
 1856,
 1857,
 1858,
 1859,
 1860,
 1861,
 1606,
 1863,
 1864,
 1865,
 1866,
 1867,
 1868,
 1869,
 1870,
 1871,
 1872,
 1873,
 1874,
 1875,
 1876,
 1877,
 1878,
 1879,
 1880,
 1881,
 1882,
 1883,
 1884,
 1885,
 1886,
 1887,
 1888,
 1889,
 1890,
 1891,
 1892,
 1637,
 1894,
 1895,
 1896,
 1897,
 1898,
 1899,
 1900,
 1893,
 1655,
 1672,
 1678,
 1679,
 1692,
 1695,
 1862,
 1731,
 1734,
 1755,
 1763,
 1765,
 1767,
 1771,
 1772,
 1773,
 1775,
 1781,
 1785,
 1788,
 1790,
 1791]

In [4]:
import pandas as pd

In [67]:
for key, value in results.items():
    #print key, value[3]
    for page in value[3]:
        #print key, page[3]


  File "<ipython-input-67-d7e53e889adc>", line 4
    #print key, page[3]
                       ^
IndentationError: expected an indented block

In [6]:
dates=[]
percentages=[]
tobin = {}
for key, value in results.items():
    for page in value[3]:
        dates.append(key)
        percentages.append(page[3])
        if key in tobin:
            tobin[key].append(page[3])
        else:
            tobin[key] = [page[3]]

In [7]:
plt.plot(dates, percentages,'.')
plt.savefig("sizes.jpg")



In [8]:
from bokeh.plotting import figure, output_file, show

In [9]:
output_file("figures1.html", title="All points")
p = figure(title= " references", x_axis_label='Year', y_axis_label='%')
p.scatter(dates, percentages)
show(p)

In [47]:
histograms = {}
for year, percentages in tobin.items():
    histograms[year] = plt.hist(percentages, [10,20,30,40,50,60,70,80,90])



In [48]:
histograms[1850][0]


Out[48]:
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [49]:
len(histograms[1606][1])
len(histograms[year][1][1:len(histograms[year][1])])


Out[49]:
8

In [50]:
for year in histograms:
    plt.plot(histograms[year][1][1:len(histograms[year][1])], histograms[year][0]) 
legend()



In [52]:
output_file("figures_overtime.html", title="Place holder")
p = figure(title= " references", x_axis_label='%', y_axis_label='Occurrences')
for year in histograms:
    p.line(histograms[year][1][1:len(histograms[year][1])], histograms[year][0])
show(p)

Normalisation

Loading the overall publication rate in the period


In [53]:
normal_filename = 'normaliser_20150616_1844.yml'
with open('datain/' + normal_filename, 'r') as f:
    publication = yaml.load(f)

Normalise by number of pages?


In [54]:
histos_normed_page = {}
for year in publication:
    #histos_normed_word[year]
    if year in histograms:
        histos_normed_page[year] = []
        for count in histograms[year][0]:
            histos_normed_page[year].append(count/publication[year][1])

In [66]:
data = {}
for year in histos_normed_page:
    plt.plot(histograms[year][1][1:len(histograms[year][1])], histos_normed_page[year], '.-') 
plt.legend()
plt.xlim(30, 80)
plt.ylim(0, 3e-4)


Out[66]:
(0, 0.0003)