In [2]:
from datetime import datetime

from lxml import html
import requests

import numpy as np
import pandas as pd
import matplotlib.pylab as plt

# Let pandas render up to 50 columns before it starts eliding them in displayed frames.
pd.set_option("display.max_columns", 50)

In [3]:
def print_element(element, max_chars=200):
    """Print a one-line preview of an lxml element.

    The line shows the element's tag, its attribute mapping and the first
    ``max_chars`` characters of its text content, with newlines replaced by
    spaces so the preview stays on a single line.

    Parameters
    ----------
    element : object exposing ``tag``, ``attrib`` and ``text_content()``
        The element to preview.
    max_chars : int, optional
        Number of leading text characters to show (default 200).
    """
    # Truncate first, then flatten newlines (same order as before).
    preview = element.text_content()[:max_chars].replace("\n", " ")
    # A parenthesised single argument prints identically on Python 2 and Python 3.
    print("<%s %s>%s ..." % (element.tag, element.attrib, preview))

In [4]:
# Download the Wikipedia list of Nobel laureates and parse it into an lxml tree.
# - HTTPS: fetch the page over an encrypted connection.
# - timeout: a stalled connection raises instead of hanging the kernel.
# - raise_for_status(): stop here on an HTTP error rather than parsing an
#   error page and failing later with a confusing message.
page = requests.get('https://en.wikipedia.org/wiki/List_of_Nobel_laureates', timeout=30)
page.raise_for_status()
tree = html.fromstring(page.text)
print_element(tree)


<html {'lang': 'en', 'class': 'client-nojs', 'dir': 'ltr'}>   List of Nobel laureates - Wikipedia, the free encyclopedia document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" ); window.RLQ  ...

In [5]:
# Preview every <table> on the page to find the one that holds the laureates.
# The loop variable is deliberately not called `table`: that name is used by the
# rest of the notebook for the selected laureate table, and reusing it here would
# silently overwrite it whenever this cell is re-run.
tables = tree.xpath('//table')
for candidate_table in tables:
    print_element(candidate_table)


<table {'class': 'wikitable sortable'}>  Year Physics Chemistry Physiology or Medicine Literature Peace Economics   1901 Röntgen, WilhelmWilhelm Röntgen Hoff, Jacobus Henricus van 'tJacobus Henricus van 't Hoff von Behring, Emil AdolfEmil  ...
<table {'style': 'border:1px solid #aaa;background-color:#f9f9f9', 'class': 'mbox-small plainlinks sistersitebox'}>   Wikimedia Commons has media related to Nobel laureates.   ...
<table {'style': 'border-spacing:0', 'class': 'navbox'}>        v t e   Nobel Prizes       Prizes    Chemistry Economics1 Literature Peace Physics Physiology or Medicine         Laureates     by subject    Chemistry Economics Literature Peace Physics Physi ...
<table {'style': 'border-spacing:0;background:transparent;color:inherit', 'class': 'nowraplinks hlist collapsible collapsed navbox-inner'}>     v t e   Nobel Prizes       Prizes    Chemistry Economics1 Literature Peace Physics Physiology or Medicine         Laureates     by subject    Chemistry Economics Literature Peace Physics Physiolo ...
<table {'style': 'border-spacing:0', 'class': 'nowraplinks navbox-subgroup'}>  by subject    Chemistry Economics Literature Peace Physics Physiology or Medicine         by criterion    Black African Asian Arab Latino and Hispanic Slavic Female         by country    Argentine A ...

In [6]:
# Keep the first table whose class attribute is exactly "wikitable sortable";
# the preview printed below shows it is the year-by-subject laureate table.
laureate_tables = tree.xpath('//table[@class="wikitable sortable"]')
table = laureate_tables[0]
print_element(table)


<table {'class': 'wikitable sortable'}>  Year Physics Chemistry Physiology or Medicine Literature Peace Economics   1901 Röntgen, WilhelmWilhelm Röntgen Hoff, Jacobus Henricus van 'tJacobus Henricus van 't Hoff von Behring, Emil AdolfEmil  ...

In [7]:
# Subject names come from the header row: skip its first cell (the "Year" column)
# and read the text of each remaining cell's first child, flattening newlines.
header_cells = table.xpath('tr')[0][1:]
subjects = [cell[0].text_content().replace("\n", " ") for cell in header_cells]
subjects


Out[7]:
['Physics',
 'Chemistry',
 'Physiology or Medicine',
 'Literature',
 'Peace',
 'Economics']

In [8]:
# One year label per data row: drop the first (header) and the last row, then
# take the text of the first cell in each remaining row.
data_rows = table.xpath('tr')[1:-1]
years = [row[0].text for row in data_rows]

In [9]:
# Sanity check on the first data row: list the laureates found under each subject.
first_row_cells = table.xpath('tr')[1][1:]  # [1:] skips the leading year cell
for index, item in enumerate(first_row_cells):
    subject = subjects[index]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("%s:" % subject)
    # Each laureate is a link nested inside a span with class "vcard".
    for winner in item.xpath('span[@class="vcard"]/span/a'):
        print(" - %s" % winner.attrib["title"])


Physics:
 - Wilhelm Röntgen
Chemistry:
 - Jacobus Henricus van 't Hoff
Physiology or Medicine:
 - Emil Adolf von Behring
Literature:
 - Sully Prudhomme
Peace:
 - Henry Dunant
 - Frédéric Passy
Economics:

In [10]:
# Flatten the laureate table into four parallel lists with one entry per laureate:
# award year, subject, laureate name (link title) and relative URL (link href).
year_list = []
subject_list = []
name_list = []
url_list = []

# Evaluate the row query once; it does not change between years.
rows = table.xpath('tr')

# rows[0] is the header row. zip() stops when `years` is exhausted, so rows beyond
# the ones `years` was built from are ignored, exactly as with index-based access.
for year, row in zip(years, rows[1:]):
    # row[0] is the year cell; the remaining cells line up with `subjects`.
    for index, item in enumerate(row[1:]):
        subject = subjects[index]
        for winner in item.xpath('span[@class="vcard"]/span/a'):
            winner_name = winner.attrib["title"]
            winner_url = winner.attrib["href"]
            year_list.append(year)
            subject_list.append(subject)
            name_list.append(winner_name)
            url_list.append(winner_url)

In [11]:
# Assemble the scraped lists into a single frame, one row per laureate.
# `columns=` pins the column order regardless of dict ordering.
data_set = pd.DataFrame(
    {
        "winner_name": name_list,
        "subject": subject_list,
        "year": year_list,
        "url": url_list,
    },
    columns=["winner_name", "subject", "year", "url"],
)
# Years were scraped as text; store them as 32-bit integers.
data_set["year"] = data_set["year"].astype(np.int32)
data_set.head(5)


Out[11]:
winner_name subject year url
0 Wilhelm Röntgen Physics 1901 /wiki/Wilhelm_R%C3%B6ntgen
1 Jacobus Henricus van 't Hoff Chemistry 1901 /wiki/Jacobus_Henricus_van_%27t_Hoff
2 Emil Adolf von Behring Physiology or Medicine 1901 /wiki/Emil_Adolf_von_Behring
3 Sully Prudhomme Literature 1901 /wiki/Sully_Prudhomme
4 Henry Dunant Peace 1901 /wiki/Henry_Dunant

In [12]:
# Number of laureate rows per year, ordered chronologically.
# Despite its name, `years_df` is a Series indexed by year.
year_counts = data_set["year"].value_counts()
years_df = year_counts.sort_index()
years_df


Out[12]:
1901     6
1902     7
1903     7
1904     5
1905     5
1906     6
1907     6
1908     7
1909     7
1910     4
1911     6
1912     6
1913     5
1914     3
1915     4
1916     1
1917     3
1918     2
1919     4
1920     5
1921     5
1922     6
1923     5
1924     3
1925     6
1926     6
1927     7
1928     4
1929     7
1930     5
        ..
1986    11
1987     9
1988    11
1989    10
1990    11
1991     7
1992     7
1993    11
1994    12
1995    11
1996    13
1997    11
1998    12
1999     6
2000    13
2001    14
2002    13
2003    11
2004    12
2005    12
2006     8
2007    11
2008    12
2009    13
2010    11
2011    13
2012    10
2013    13
2014    13
2015    11
dtype: int64

In [15]:
# Line chart of the number of laureate rows per year, followed by the overall total.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(years_df.index, years_df.values, linewidth=2, alpha=.6)
ax.grid()
ax.set_xlabel("Year")
ax.set_ylabel("Number of Prizes")
plt.show()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Total Prizes: %s" % len(data_set))


Total Prizes: 877

In [16]:
# Cumulative number of prizes per subject across all years in the table.
fig, ax = plt.subplots(figsize=(13, 5))

for subject in subjects:
    subject_years = data_set.loc[data_set["subject"] == subject, "year"]
    # Count per year, order chronologically, then accumulate.
    running_total = subject_years.value_counts().sort_index().cumsum()
    ax.plot(running_total.index, running_total, label=subject, linewidth=2, alpha=.6)

ax.grid()
ax.legend(loc="best")
ax.set_xlabel("Year")
ax.set_ylabel("Cumulative Sum of Given Nobel Prizes")
ax.set_xticks(np.arange(1900, 2020, 10))

plt.show()

Az I. és II. világháború hatása (effect of World War I and World War II)


In [17]:
# Cumulative prizes per subject before 1950, with both World Wars shaded so the
# flattening of the curves during the war years is visible.
fig, ax = plt.subplots(figsize=(13, 5))

# "year" is already stored as int32 when data_set is built, so no cast is needed;
# filter once instead of once per subject.
pre_1950 = data_set[data_set["year"] < 1950]

for subject in subjects:
    subject_years = pre_1950.loc[pre_1950["subject"] == subject, "year"]
    cumulative = subject_years.value_counts().sort_index().cumsum()
    ax.plot(cumulative.index, cumulative, label=subject, linewidth=2, alpha=.6)

ax.grid()
ax.legend(loc="best")
ax.set_xlabel("Year")
ax.set_ylabel("Cumulative Sum of Given Nobel Prizes")
ax.set_xticks(np.arange(1900, 1950, 5))

# (label, first year, last year, x position of the label)
war_periods = [
    ("WW I", 1914, 1918, 1915),
    ("WW II", 1939, 1945, 1941),
]
band_height = 60  # tall enough to cover the curves in this range
label_y = 55      # just below the top of the band

for label, start, end, label_x in war_periods:
    ax.add_patch(plt.Rectangle((start, 0), end - start, band_height,
                               alpha=.3, color="orange"))
    # Pass the text positionally: the keyword was `s` in older matplotlib and
    # `text` in newer releases, while the positional form works in both.
    ax.annotate(label, xy=(label_x, label_y))

plt.show()

In [ ]: