In [1]:
import cs109style
cs109style.customize_mpl()
cs109style.customize_css()

# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt
import requests
from pattern import web


Setting custom matplotlib visual style
Setting custom CSS for the IPython Notebook

Fetching population data from Wikipedia

In this example we will fetch data about countries and their population from Wikipedia.

http://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population has several tables for individual countries, subcontinents as well as different years. We will combine the data for all countries and all years in a single panda dataframe and visualize the change in population for different countries.

We will go through the following steps:

  • fetching html with embedded data
  • parsing html to extract the data
  • collecting the data in a panda dataframe
  • displaying the data

To give you some starting points for your homework, we will also show the different sub-steps that can be taken to reach the presented solution.

Fetching the Wikipedia site


In [2]:
url = 'http://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population'
website_html = requests.get(url).text
#print website_html

Parsing html data


In [3]:
def get_population_html_tables(html):
    """Parse html and return html tables of wikipedia population data."""

    dom = web.Element(html)

    ### 0. step: look at html source!
    #### 1. step: get all tables
    #tbls = dom('table') # <-- This gets us all the tables
    # Only get the tables we need
    # But we need only the tables with class "sortable wikitable"
    tbls = dom.by_class('sortable wikitable')
    #### 2. step: get all tables we care about

    return tbls

tables = get_population_html_tables(website_html)
print "table length: %d" %len(tables)
for t in tables:
    print t.attributes


table length: 6
{u'style': u'text-align: right', u'class': u'sortable wikitable'}
{u'style': u'text-align: right', u'class': u'sortable wikitable'}
{u'style': u'text-align: right', u'class': u'sortable wikitable'}
{u'style': u'text-align: right', u'class': u'sortable wikitable'}
{u'style': u'text-align: right', u'class': u'sortable wikitable'}
{u'style': u'text-align: right', u'class': u'sortable wikitable'}

In [4]:
def table_type(tbl):
    ### Extract the table type
    return tbl('th')[0].content  # The second header type and content (what we need)
    #for th in tbl('th'):
        #print th.content
    #return 'table'

    
# Method 1
# group the tables by type
#table_by_type = {}
#for tbl in tables:
#    typ = table_type(tbl)
#    if typ not in table_by_type:
#        table_by_type[typ] = list()
#    tables_by_type[typ].append(tbl)

    
# Method 2
# Equivalent code below

# tables_by_type = defaultdict(list) # To group in lists in dictionaries
# defaultdicts have a default value that is inserted when a new key is accessed
# So, in the loop below, if it has not encountered the table type
# It inserts an empty list
# This is why the "defaultdict" is used !

# group the tables by type
tables_by_type = defaultdict(list)  
# defaultdicts have a default value that is inserted when a new key is accessed
for tbl in tables:
    tables_by_type[table_type(tbl)].append(tbl)

    # Shows country or territory has 3 tables
print tables_by_type


defaultdict(<type 'list'>, {u'Country or territory': [Element(tag=u'table'), Element(tag=u'table'), Element(tag=u'table')], u'(Sub)continent': [Element(tag=u'table'), Element(tag=u'table'), Element(tag=u'table')]})

Extracting data and filling it into a dictionary


In [17]:
def get_countries_population(tables):
    """Extract population data for countries from all tables and store it in dictionary."""
    
    result = defaultdict(dict)

    for tbl in tables:
        # 1. step: try to extract data for a single table
        tbl = tables[0]
        # From the fist table (tbl) get the headers
        # I just need the rownames, and the years columns
        # From the Wikipedia URL
        # I do not need the % growth columns
        headers = tbl('tr')
        first_header = headers[0]
        th_s = first_header('th')
        # Extracting the columns that has years only
        years = [int(val.content) for val in th_s if val.content.isnumeric()]
        # Enumerate produces a pair - the index and the value
        # Enumerate can be applied to lists, and other iterable objects
        year_indices = [idx for idx, val in enumerate(th_s) if val.content.isnumeric()]
        #table_headers = tbl('tr')[0]('th')
        print years
        print year_indices
        #years = [int(th.content)]
        # 2. step: iterate over all tables, extract headings and actual data and combine data into single dict
        rows = tbl('tr')[1:]
        for row in rows:
            tds = row('td')
            country_name = tds[0]('a')[0].content
            population_by_year = [int(tds[colidx].content.replace(',','')) for 
                                  colidx in year_indices]
            # zip creates a sequences of tuples
            subdict = dict(zip(years, population_by_year))
            result[country_name].update(subdict)
    return result


result = get_countries_population(tables_by_type['Country or territory'])
print len(result)
print result[u'Canada']


[1950, 1955, 1960, 1965, 1970, 1975, 1980]
[1, 2, 4, 6, 8, 10, 12]
[1950, 1955, 1960, 1965, 1970, 1975, 1980]
[1, 2, 4, 6, 8, 10, 12]
[1950, 1955, 1960, 1965, 1970, 1975, 1980]
[1, 2, 4, 6, 8, 10, 12]
227
{1955: 16050, 1960: 18267, 1965: 20071, 1970: 21750, 1975: 23209, 1980: 24593, 1950: 14011}

In [ ]:
# When you get an error
# You can just insert a cell below the traceback with %debug
# And it will take you to the debugger
#%debug

Creating a dataframe from a dictionary


In [20]:
# create dataframe

df = pd.DataFrame.from_dict(result, orient='index')
# sort based on year
df.sort(axis=1,inplace=True)
print df


<class 'pandas.core.frame.DataFrame'>
Index: 227 entries, Afghanistan to Zimbabwe
Data columns (total 7 columns):
1950    227  non-null values
1955    227  non-null values
1960    227  non-null values
1965    227  non-null values
1970    227  non-null values
1975    227  non-null values
1980    227  non-null values
dtypes: int64(7)

Some data accessing functions for a panda dataframe


In [21]:
subtable = df.iloc[0:2, 0:2]
print "subtable"
print subtable
print ""

column = df[1955]
print "column"
print column
print ""

row = df.ix[0] #row 0
print "row"
print row
print ""

rows = df.ix[:2] #rows 0,1
print "rows"
print rows
print ""

element = df.ix[0,1955] #element
print "element"
print element
print ""

# max along column
print "max"
print df[1950].max()
print ""

# axes
print "axes"
print df.axes
print ""

row = df.ix[0]
print "row info"
print row.name
print row.index
print ""

countries =  df.index
print "countries"
print countries
print ""

print "Austria"
print df.ix['Austria']


subtable
             1950  1955
Afghanistan  8150  8891
Albania      1227  1392

column
Afghanistan             8891
Albania                 1392
Algeria                 9842
American Samoa            20
Andorra                    6
Angola                  4423
Anguilla                   5
Antigua and Barbuda       51
Argentina              18928
Armenia                 1565
Aruba                     54
Australia               9277
Austria                 6947
Azerbaijan              3314
Bahamas                   87
...
United Arab Emirates                83
United Kingdom                   50946
United States                   165069
United States Virgin Islands        28
Uruguay                           2353
Uzbekistan                        7232
Vanuatu                             59
Venezuela                         6170
Vietnam                          27738
Wallis and Futuna                    7
West Bank                          788
Western Sahara                      16
Yemen                             5265
Zambia                            2869
Zimbabwe                          3409
Name: 1955, Length: 227, dtype: int64

row
1950     8150
1955     8891
1960     9829
1965    10998
1970    12431
1975    14132
1980    15044
Name: Afghanistan, dtype: int64

rows
             1950  1955  1960   1965   1970   1975   1980
Afghanistan  8150  8891  9829  10998  12431  14132  15044
Albania      1227  1392  1623   1884   2157   2401   2671

element
8891

max
562580

axes
[Index([u'Afghanistan', u'Albania', u'Algeria', u'American Samoa', u'Andorra', u'Angola', u'Anguilla', u'Antigua and Barbuda', u'Argentina', u'Armenia', u'Aruba', u'Australia', u'Austria', u'Azerbaijan', u'Bahamas', u'Bahrain', u'Bangladesh', u'Barbados', u'Belarus', u'Belgium', u'Belize', u'Benin', u'Bermuda', u'Bhutan', u'Bolivia', u'Bosnia and Herzegovina', u'Botswana', u'Brazil', u'British Virgin Islands', u'Brunei', u'Bulgaria', u'Burkina Faso', u'Burma', u'Burundi', u'Cambodia', u'Cameroon', u'Canada', u'Cape Verde', u'Cayman Islands', u'Central African Republic', u'Chad', u'Chile', u'China', u'Colombia', u'Comoros', u'Congo (Brazzaville)', u'Congo (Kinshasa)', u'Cook Islands', u'Costa Rica', u'Croatia', u'Cuba', u'Curaçao', u'Cyprus', u'Czech Republic', u'Denmark', u'Djibouti', u'Dominica', u'Dominican Republic', u'Ecuador', u'Egypt', u'El Salvador', u'Equatorial Guinea', u'Eritrea', u'Estonia', u'Ethiopia', u'Faroe Islands', u'Federated States of Micronesia', u'Fiji', u'Finland', u'France', u'French Polynesia', u'Gabon', u'Gambia', u'Gaza Strip', u'Georgia', u'Germany', u'Ghana', u'Gibraltar', u'Greece', u'Greenland', u'Grenada', u'Guam', u'Guatemala', u'Guernsey', u'Guinea', u'Guinea-Bissau', u'Guyana', u'Haiti', u'Honduras', u'Hong Kong', u'Hungary', u'Iceland', u'India', u'Indonesia', u'Iran', u'Iraq', u'Ireland', u'Isle of Man', u'Israel', u'Italy', u'Ivory Coast', u'Jamaica', u'Japan', u'Jersey', u'Jordan', u'Kazakhstan', u'Kenya', u'Kiribati', u'Kuwait', u'Kyrgyzstan', u'Laos', u'Latvia', u'Lebanon', u'Lesotho', u'Liberia', u'Libya', u'Liechtenstein', u'Lithuania', u'Luxembourg', u'Macau', u'Macedonia', u'Madagascar', u'Malawi', u'Malaysia', u'Maldives', u'Mali', u'Malta', u'Marshall Islands', u'Mauritania', u'Mauritius', u'Mayotte', u'Mexico', u'Moldova', u'Monaco', u'Mongolia', u'Montenegro', u'Montserrat', u'Morocco', u'Mozambique', u'Namibia', u'Nauru', u'Nepal', u'Netherlands', u'New Caledonia', u'New Zealand', u'Nicaragua', u'Niger', u'Nigeria', u'North Korea', u'Northern Mariana Islands', u'Norway', u'Oman', u'Pakistan', u'Palau', u'Panama', u'Papua New Guinea', u'Paraguay', u'Peru', u'Philippines', u'Poland', u'Portugal', u'Puerto Rico', u'Qatar', u'Romania', u'Russia', u'Rwanda', u'Saint Barthélemy', u'Saint Helena, Ascension and Tristan da Cunha', u'Saint Kitts and Nevis', u'Saint Lucia', u'Saint Martin', u'Saint Pierre and Miquelon', u'Saint Vincent and the Grenadines', u'Samoa', u'San Marino', u'Saudi Arabia', u'Senegal', u'Serbia', u'Seychelles', u'Sierra Leone', u'Singapore', u'Sint Maarten', u'Slovakia', u'Slovenia', u'Solomon Islands', u'Somalia', u'South Africa', u'South Korea', u'Spain', u'Sri Lanka', u'Sudan', u'Suriname', u'Swaziland', u'Sweden', u'Switzerland', u'Syria', u'São Tomé and Príncipe', u'Taiwan', u'Tajikistan', u'Tanzania', u'Thailand', u'Timor-Leste', u'Togo', u'Tonga', u'Trinidad and Tobago', u'Tunisia', u'Turkey', u'Turkmenistan', u'Turks and Caicos Islands', u'Tuvalu', u'Uganda', u'Ukraine', u'United Arab Emirates', u'United Kingdom', u'United States', u'United States Virgin Islands', u'Uruguay', u'Uzbekistan', u'Vanuatu', u'Venezuela', u'Vietnam', u'Wallis and Futuna', u'West Bank', u'Western Sahara', u'Yemen', u'Zambia', u'Zimbabwe'], dtype=object), Int64Index([1950, 1955, 1960, 1965, 1970, 1975, 1980], dtype=int64)]

row info
Afghanistan
Int64Index([1950, 1955, 1960, 1965, 1970, 1975, 1980], dtype=int64)

countries
Index([u'Afghanistan', u'Albania', u'Algeria', u'American Samoa', u'Andorra', u'Angola', u'Anguilla', u'Antigua and Barbuda', u'Argentina', u'Armenia', u'Aruba', u'Australia', u'Austria', u'Azerbaijan', u'Bahamas', u'Bahrain', u'Bangladesh', u'Barbados', u'Belarus', u'Belgium', u'Belize', u'Benin', u'Bermuda', u'Bhutan', u'Bolivia', u'Bosnia and Herzegovina', u'Botswana', u'Brazil', u'British Virgin Islands', u'Brunei', u'Bulgaria', u'Burkina Faso', u'Burma', u'Burundi', u'Cambodia', u'Cameroon', u'Canada', u'Cape Verde', u'Cayman Islands', u'Central African Republic', u'Chad', u'Chile', u'China', u'Colombia', u'Comoros', u'Congo (Brazzaville)', u'Congo (Kinshasa)', u'Cook Islands', u'Costa Rica', u'Croatia', u'Cuba', u'Curaçao', u'Cyprus', u'Czech Republic', u'Denmark', u'Djibouti', u'Dominica', u'Dominican Republic', u'Ecuador', u'Egypt', u'El Salvador', u'Equatorial Guinea', u'Eritrea', u'Estonia', u'Ethiopia', u'Faroe Islands', u'Federated States of Micronesia', u'Fiji', u'Finland', u'France', u'French Polynesia', u'Gabon', u'Gambia', u'Gaza Strip', u'Georgia', u'Germany', u'Ghana', u'Gibraltar', u'Greece', u'Greenland', u'Grenada', u'Guam', u'Guatemala', u'Guernsey', u'Guinea', u'Guinea-Bissau', u'Guyana', u'Haiti', u'Honduras', u'Hong Kong', u'Hungary', u'Iceland', u'India', u'Indonesia', u'Iran', u'Iraq', u'Ireland', u'Isle of Man', u'Israel', u'Italy', u'Ivory Coast', u'Jamaica', u'Japan', u'Jersey', u'Jordan', u'Kazakhstan', u'Kenya', u'Kiribati', u'Kuwait', u'Kyrgyzstan', u'Laos', u'Latvia', u'Lebanon', u'Lesotho', u'Liberia', u'Libya', u'Liechtenstein', u'Lithuania', u'Luxembourg', u'Macau', u'Macedonia', u'Madagascar', u'Malawi', u'Malaysia', u'Maldives', u'Mali', u'Malta', u'Marshall Islands', u'Mauritania', u'Mauritius', u'Mayotte', u'Mexico', u'Moldova', u'Monaco', u'Mongolia', u'Montenegro', u'Montserrat', u'Morocco', u'Mozambique', u'Namibia', u'Nauru', u'Nepal', u'Netherlands', u'New Caledonia', u'New Zealand', u'Nicaragua', u'Niger', u'Nigeria', u'North Korea', u'Northern Mariana Islands', u'Norway', u'Oman', u'Pakistan', u'Palau', u'Panama', u'Papua New Guinea', u'Paraguay', u'Peru', u'Philippines', u'Poland', u'Portugal', u'Puerto Rico', u'Qatar', u'Romania', u'Russia', u'Rwanda', u'Saint Barthélemy', u'Saint Helena, Ascension and Tristan da Cunha', u'Saint Kitts and Nevis', u'Saint Lucia', u'Saint Martin', u'Saint Pierre and Miquelon', u'Saint Vincent and the Grenadines', u'Samoa', u'San Marino', u'Saudi Arabia', u'Senegal', u'Serbia', u'Seychelles', u'Sierra Leone', u'Singapore', u'Sint Maarten', u'Slovakia', u'Slovenia', u'Solomon Islands', u'Somalia', u'South Africa', u'South Korea', u'Spain', u'Sri Lanka', u'Sudan', u'Suriname', u'Swaziland', u'Sweden', u'Switzerland', u'Syria', u'São Tomé and Príncipe', u'Taiwan', u'Tajikistan', u'Tanzania', u'Thailand', u'Timor-Leste', u'Togo', u'Tonga', u'Trinidad and Tobago', u'Tunisia', u'Turkey', u'Turkmenistan', u'Turks and Caicos Islands', u'Tuvalu', u'Uganda', u'Ukraine', u'United Arab Emirates', u'United Kingdom', u'United States', u'United States Virgin Islands', u'Uruguay', u'Uzbekistan', u'Vanuatu', u'Venezuela', u'Vietnam', u'Wallis and Futuna', u'West Bank', u'Western Sahara', u'Yemen', u'Zambia', u'Zimbabwe'], dtype=object)

Austria
1950    6935
1955    6947
1960    7047
1965    7271
1970    7467
1975    7579
1980    7549
Name: Austria, dtype: int64

Plotting population of 4 countries


In [22]:
plotCountries = ['Austria', 'Germany', 'United States', 'France']
    
for country in plotCountries:
    row = df.ix[country]
    plt.plot(row.index, row, label=row.name ) 
    
plt.ylim(ymin=0) # start y axis at 0

plt.xticks(rotation=70)
plt.legend(loc='best')
plt.xlabel("Year")
plt.ylabel("# people (million)")
plt.title("Population of countries")


Out[22]:
<matplotlib.text.Text at 0x107a6e610>

Plot 5 most populous countries from 2010 and 2060


In [23]:
def plot_populous(df, year):
    # sort table depending on data value in year column
    df_by_year = df.sort(year, ascending=False)
    
    plt.figure()
    for i in range(5):  
        row = df_by_year.ix[i]
        plt.plot(row.index, row, label=row.name ) 
            
    plt.ylim(ymin=0)
    
    plt.xticks(rotation=70)
    plt.legend(loc='best')
    plt.xlabel("Year")
    plt.ylabel("# people (million)")
    plt.title("Most populous countries in %d" % year)

plot_populous(df, 2010)
plot_populous(df, 2050)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-23-3d84f5b7c3ce> in <module>()
     16     plt.title("Most populous countries in %d" % year)
     17 
---> 18 plot_populous(df, 2010)
     19 plot_populous(df, 2050)

<ipython-input-23-3d84f5b7c3ce> in plot_populous(df, year)
      1 def plot_populous(df, year):
      2     # sort table depending on data value in year column
----> 3     df_by_year = df.sort(year, ascending=False)
      4 
      5     plt.figure()

/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in sort(self, columns, column, axis, ascending, inplace)
   3203             columns = column
   3204         return self.sort_index(by=columns, axis=axis, ascending=ascending,
-> 3205                                inplace=inplace)
   3206 
   3207     def sort_index(self, axis=0, by=None, ascending=True, inplace=False,

/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in sort_index(self, axis, by, ascending, inplace, kind)
   3265             else:
   3266                 by = by[0]
-> 3267                 k = self[by].values
   3268                 if k.ndim == 2:
   3269                     raise ValueError('Cannot sort by duplicate column %s'

/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2001             # get column
   2002             if self.columns.is_unique:
-> 2003                 return self._get_item_cache(key)
   2004 
   2005             # duplicate columns

/usr/local/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
    665             return cache[item]
    666         except Exception:
--> 667             values = self._data.get(item)
    668             res = self._box_item_values(item, values)
    669             cache[item] = res

/usr/local/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item)
   1653     def get(self, item):
   1654         if self.items.is_unique:
-> 1655             _, block = self._find_block(item)
   1656             return block.get(item)
   1657         else:

/usr/local/lib/python2.7/site-packages/pandas/core/internals.pyc in _find_block(self, item)
   1933 
   1934     def _find_block(self, item):
-> 1935         self._check_have(item)
   1936         for i, block in enumerate(self.blocks):
   1937             if item in block:

/usr/local/lib/python2.7/site-packages/pandas/core/internals.pyc in _check_have(self, item)
   1940     def _check_have(self, item):
   1941         if item not in self.items:
-> 1942             raise KeyError('no item named %s' % com.pprint_thing(item))
   1943 
   1944     def reindex_axis(self, new_axis, method=None, axis=0, copy=True):

KeyError: u'no item named 2010'

In [ ]:
%debug


> /usr/local/lib/python2.7/site-packages/pandas/core/internals.py(1942)_check_have()
   1941         if item not in self.items:
-> 1942             raise KeyError('no item named %s' % com.pprint_thing(item))
   1943 


In [23]:


In [ ]: