In [1]:

    
import cs109style
cs109style.customize_mpl()
cs109style.customize_css()

# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt
import requests
from pattern import web









    



Setting custom matplotlib visual style
Setting custom CSS for the IPython Notebook

Fetching population data from Wikipedia

In this example we will fetch data about countries and their population from Wikipedia.

http://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population has several tables for individual countries, subcontinents as well as different years. We will combine the data for all countries and all years in a single panda dataframe and visualize the change in population for different countries.

We will go through the following steps:

fetching html with embedded data
parsing html to extract the data
collecting the data in a panda dataframe
displaying the data

To give you some starting points for your homework, we will also show the different sub-steps that can be taken to reach the presented solution.

Fetching the Wikipedia site



In [6]:

    
url = 'http://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population'
website_html = requests.get(url).text
#print website_html

Parsing html data



In [3]:

    
def get_population_html_tables(html):
    """Parse html and return html tables of wikipedia population data."""

    dom = web.Element(html)

    # 0. step: look at html source!
    
    # 1. step: get all tables
    
    # tbls = [t for t in dom.by_tag('table')]

    # 2. step: get all wikitable sortable tables (the ones with data)
    
    tbls = [t for t in dom.by_tag('table') if t.attributes['class'] == "wikitable sortable"]
    
    return tbls

tables = get_population_html_tables(website_html)
print "table length: %d" %len(tables)
for t in tables:
    print t.attributes









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-3-9b876542f223> in <module>()
     16     return tbls
     17 
---> 18 tables = get_population_html_tables(website_html)
     19 print "table length: %d" %len(tables)
     20 for t in tables:

<ipython-input-3-9b876542f223> in get_population_html_tables(html)
     12     # 2. step: get all wikitable sortable tables (the ones with data)
     13 
---> 14     tbls = [t for t in dom.by_tag('table') if t.attributes['class'] == "wikitable sortable"]
     15 
     16     return tbls

KeyError: 'class'



In [ ]:

    
def table_type(tbl):
    headers = [th.content for th in tbl.by_tag('th')]
    return headers[1]

# group the tables by type
tables_by_type = defaultdict(list)  # defaultdicts have a default value that is inserted when a new key is accessed
for tbl in tables:
    tables_by_type[table_type(tbl)].append(tbl)

print tables_by_type

Extracting data and filling it into a dictionary



In [4]:

    
def get_countries_population(tables):
    """Extract population data for countries from all tables and store it in dictionary."""
    
    result = defaultdict(dict)

    # 1. step: try to extract data for a single table

    # 2. step: iterate over all tables, extract headings and actual data and combine data into single dict
    
    for tbl in tables:
        # extract column headers    
        # each table looks a little different, therefore extract columns that store data (i.e., table header is a year)
        tbl_headers = [ th.content for th in tbl.by_tag('th')]
        column_idx_years = [(idx, int(header)) for idx, header in enumerate(tbl_headers) if header.isnumeric()]
        column_idx, column_years = zip(*column_idx_years)
        
        # extract data from table
    
        # get table rows - but skip the ones that have no td element
        tbl_rows = [ row for row in tbl.by_tag('tr') if row.by_tag('td') ]
        #print len(trs)
        #print trs[0]
    
        for row in tbl_rows:
    
            #datarow = [td.content for td in tr.by_tag('td')]
            #print datarow
        
            # get country name - 2nd td, a href, convert unicode to string
            countryname = (row.by_tag('td')[1].by_tag('a')[0].content).encode('ascii','ignore') 
            #print type(countryname)
            #print countryname
        
            # get country data - create a dictionary {1955: 10000, 1960: 14000,...}
            # extract data from the columns in column_idx; strip commas from numers; scale number to millions
            countrydata = {column_years[i]:int(row.by_tag('td')[idx].content.replace(',', ''))/1000.0 for i,idx in enumerate(column_idx) }
            #print datarow
            
            # append to dictionary
            result[countryname].update(countrydata)
    
    return result


result = get_countries_population(tables_by_type['Country or territory'])
print result









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-b963b85bb156> in <module>()
     43 
     44 
---> 45 result = get_countries_population(tables_by_type['Country or territory'])
     46 print result

NameError: name 'tables_by_type' is not defined

Creating a dataframe from a dictionary



In [8]:

    
# create dataframe

df = pd.DataFrame.from_dict(result, orient='index')
# sort based on year
df.sort(axis=1,inplace=True)
print df









    



<class 'pandas.core.frame.DataFrame'>
Index: 227 entries, Afghanistan to Zimbabwe
Data columns (total 21 columns):
1950    227  non-null values
1955    227  non-null values
1960    227  non-null values
1965    227  non-null values
1970    227  non-null values
1975    227  non-null values
1980    227  non-null values
1985    227  non-null values
1990    227  non-null values
1995    227  non-null values
2000    227  non-null values
2005    227  non-null values
2010    227  non-null values
2015    227  non-null values
2020    227  non-null values
2025    227  non-null values
2030    227  non-null values
2035    227  non-null values
2040    227  non-null values
2045    227  non-null values
2050    227  non-null values
dtypes: float64(21)

Some data accessing functions for a panda dataframe



In [9]:

    
subtable = df.iloc[0:2, 0:2]
print "subtable"
print subtable
print ""

column = df[1955]
print "column"
print column
print ""

row = df.ix[0] #row 0
print "row"
print row
print ""

rows = df.ix[:2] #rows 0,1
print "rows"
print rows
print ""

element = df.ix[0,1955] #element
print "element"
print element
print ""

# max along column
print "max"
print df[1950].max()
print ""

# axes
print "axes"
print df.axes
print ""

row = df.ix[0]
print "row info"
print row.name
print row.index
print ""

countries =  df.index
print "countries"
print countries
print ""

print "Austria"
print df.ix['Austria']









    



subtable
              1950   1955
Afghanistan  8.150  8.891
Albania      1.227  1.392

column
Afghanistan             8.891
Albania                 1.392
Algeria                 9.842
American Samoa          0.020
Andorra                 0.006
Angola                  4.423
Anguilla                0.005
Antigua and Barbuda     0.051
Argentina              18.928
Armenia                 1.565
Aruba                   0.054
Australia               9.277
Austria                 6.947
Azerbaijan              3.314
Bahamas                 0.087
...
United Arab Emirates              0.083
United Kingdom                   50.946
United States                   165.069
United States Virgin Islands      0.028
Uruguay                           2.353
Uzbekistan                        7.232
Vanuatu                           0.059
Venezuela                         6.170
Vietnam                          27.738
Wallis and Futuna                 0.007
West Bank                         0.788
Western Sahara                    0.016
Yemen                             5.265
Zambia                            2.869
Zimbabwe                          3.409
Name: 1955, Length: 227, dtype: float64

row
1950     8.150
1955     8.891
1960     9.829
1965    10.998
1970    12.431
1975    14.132
1980    15.044
1985    13.120
1990    13.568
1995    19.445
2000    22.461
2005    26.335
2010    29.121
2015    32.564
2020    36.644
2025    41.117
2030    45.665
2035    50.195
2040    54.717
2045    59.255
2050    63.795
Name: Afghanistan, dtype: float64

rows
<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, Afghanistan to Albania
Data columns (total 21 columns):
1950    2  non-null values
1955    2  non-null values
1960    2  non-null values
1965    2  non-null values
1970    2  non-null values
1975    2  non-null values
1980    2  non-null values
1985    2  non-null values
1990    2  non-null values
1995    2  non-null values
2000    2  non-null values
2005    2  non-null values
2010    2  non-null values
2015    2  non-null values
2020    2  non-null values
2025    2  non-null values
2030    2  non-null values
2035    2  non-null values
2040    2  non-null values
2045    2  non-null values
2050    2  non-null values
dtypes: float64(21)

element
8.891

max
562.58

axes
[Index([u'Afghanistan', u'Albania', u'Algeria', u'American Samoa', u'Andorra', u'Angola', u'Anguilla', u'Antigua and Barbuda', u'Argentina', u'Armenia', u'Aruba', u'Australia', u'Austria', u'Azerbaijan', u'Bahamas', u'Bahrain', u'Bangladesh', u'Barbados', u'Belarus', u'Belgium', u'Belize', u'Benin', u'Bermuda', u'Bhutan', u'Bolivia', u'Bosnia and Herzegovina', u'Botswana', u'Brazil', u'British Virgin Islands', u'Brunei', u'Bulgaria', u'Burkina Faso', u'Burundi', u'Cambodia', u'Cameroon', u'Canada', u'Cape Verde', u'Cayman Islands', u'Central African Republic', u'Chad', u'Chile', u'China', u'Colombia', u'Comoros', u'Congo (Brazzaville)', u'Congo (Kinshasa)', u'Cook Islands', u'Costa Rica', u'Croatia', u'Cuba', u'Curaao', u'Cyprus', u'Czech Republic', u'Denmark', u'Djibouti', u'Dominica', u'Dominican Republic', u'Ecuador', u'Egypt', u'El Salvador', u'Equatorial Guinea', u'Eritrea', u'Estonia', u'Ethiopia', u'Faroe Islands', u'Federated States of Micronesia', u'Fiji', u'Finland', u'France', u'French Polynesia', u'Gabon', u'Gambia', u'Gaza Strip', u'Georgia', u'Germany', u'Ghana', u'Gibraltar', u'Greece', u'Greenland', u'Grenada', u'Guam', u'Guatemala', u'Guernsey', u'Guinea', u'Guinea-Bissau', u'Guyana', u'Haiti', u'Honduras', u'Hong Kong', u'Hungary', u'Iceland', u'India', u'Indonesia', u'Iran', u'Iraq', u'Ireland', u'Isle of Man', u'Israel', u'Italy', u'Ivory Coast', u'Jamaica', u'Japan', u'Jersey', u'Jordan', u'Kazakhstan', u'Kenya', u'Kiribati', u'Kuwait', u'Kyrgyzstan', u'Laos', u'Latvia', u'Lebanon', u'Lesotho', u'Liberia', u'Libya', u'Liechtenstein', u'Lithuania', u'Luxembourg', u'Macau', u'Macedonia', u'Madagascar', u'Malawi', u'Malaysia', u'Maldives', u'Mali', u'Malta', u'Marshall Islands', u'Mauritania', u'Mauritius', u'Mayotte', u'Mexico', u'Moldova', u'Monaco', u'Mongolia', u'Montenegro', u'Montserrat', u'Morocco', u'Mozambique', u'Myanmar', u'Namibia', u'Nauru', u'Nepal', u'Netherlands', u'New Caledonia', u'New Zealand', u'Nicaragua', u'Niger', u'Nigeria', u'North Korea', u'Northern Mariana Islands', u'Norway', u'Oman', u'Pakistan', u'Palau', u'Panama', u'Papua New Guinea', u'Paraguay', u'Peru', u'Philippines', u'Poland', u'Portugal', u'Puerto Rico', u'Qatar', u'Romania', u'Russia', u'Rwanda', u'Saint Barthlemy', u'Saint Helena, Ascension and Tristan da Cunha', u'Saint Kitts and Nevis', u'Saint Lucia', u'Saint Martin', u'Saint Pierre and Miquelon', u'Saint Vincent and the Grenadines', u'Samoa', u'San Marino', u'Saudi Arabia', u'Senegal', u'Serbia', u'Seychelles', u'Sierra Leone', u'Singapore', u'Sint Maarten', u'Slovakia', u'Slovenia', u'So Tom and Prncipe', u'Solomon Islands', u'Somalia', u'South Africa', u'South Korea', u'Spain', u'Sri Lanka', u'Sudan', u'Suriname', u'Swaziland', u'Sweden', u'Switzerland', u'Syria', u'Taiwan', u'Tajikistan', u'Tanzania', u'Thailand', u'Timor-Leste', u'Togo', u'Tonga', u'Trinidad and Tobago', u'Tunisia', u'Turkey', u'Turkmenistan', u'Turks and Caicos Islands', u'Tuvalu', u'Uganda', u'Ukraine', u'United Arab Emirates', u'United Kingdom', u'United States', u'United States Virgin Islands', u'Uruguay', u'Uzbekistan', u'Vanuatu', u'Venezuela', u'Vietnam', u'Wallis and Futuna', u'West Bank', u'Western Sahara', u'Yemen', u'Zambia', u'Zimbabwe'], dtype=object), Int64Index([1950, 1955, 1960, 1965, 1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 2025, 2030, 2035, 2040, 2045, 2050], dtype=int64)]

row info
Afghanistan
Int64Index([1950, 1955, 1960, 1965, 1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 2025, 2030, 2035, 2040, 2045, 2050], dtype=int64)

countries
Index([u'Afghanistan', u'Albania', u'Algeria', u'American Samoa', u'Andorra', u'Angola', u'Anguilla', u'Antigua and Barbuda', u'Argentina', u'Armenia', u'Aruba', u'Australia', u'Austria', u'Azerbaijan', u'Bahamas', u'Bahrain', u'Bangladesh', u'Barbados', u'Belarus', u'Belgium', u'Belize', u'Benin', u'Bermuda', u'Bhutan', u'Bolivia', u'Bosnia and Herzegovina', u'Botswana', u'Brazil', u'British Virgin Islands', u'Brunei', u'Bulgaria', u'Burkina Faso', u'Burundi', u'Cambodia', u'Cameroon', u'Canada', u'Cape Verde', u'Cayman Islands', u'Central African Republic', u'Chad', u'Chile', u'China', u'Colombia', u'Comoros', u'Congo (Brazzaville)', u'Congo (Kinshasa)', u'Cook Islands', u'Costa Rica', u'Croatia', u'Cuba', u'Curaao', u'Cyprus', u'Czech Republic', u'Denmark', u'Djibouti', u'Dominica', u'Dominican Republic', u'Ecuador', u'Egypt', u'El Salvador', u'Equatorial Guinea', u'Eritrea', u'Estonia', u'Ethiopia', u'Faroe Islands', u'Federated States of Micronesia', u'Fiji', u'Finland', u'France', u'French Polynesia', u'Gabon', u'Gambia', u'Gaza Strip', u'Georgia', u'Germany', u'Ghana', u'Gibraltar', u'Greece', u'Greenland', u'Grenada', u'Guam', u'Guatemala', u'Guernsey', u'Guinea', u'Guinea-Bissau', u'Guyana', u'Haiti', u'Honduras', u'Hong Kong', u'Hungary', u'Iceland', u'India', u'Indonesia', u'Iran', u'Iraq', u'Ireland', u'Isle of Man', u'Israel', u'Italy', u'Ivory Coast', u'Jamaica', u'Japan', u'Jersey', u'Jordan', u'Kazakhstan', u'Kenya', u'Kiribati', u'Kuwait', u'Kyrgyzstan', u'Laos', u'Latvia', u'Lebanon', u'Lesotho', u'Liberia', u'Libya', u'Liechtenstein', u'Lithuania', u'Luxembourg', u'Macau', u'Macedonia', u'Madagascar', u'Malawi', u'Malaysia', u'Maldives', u'Mali', u'Malta', u'Marshall Islands', u'Mauritania', u'Mauritius', u'Mayotte', u'Mexico', u'Moldova', u'Monaco', u'Mongolia', u'Montenegro', u'Montserrat', u'Morocco', u'Mozambique', u'Myanmar', u'Namibia', u'Nauru', u'Nepal', u'Netherlands', u'New Caledonia', u'New Zealand', u'Nicaragua', u'Niger', u'Nigeria', u'North Korea', u'Northern Mariana Islands', u'Norway', u'Oman', u'Pakistan', u'Palau', u'Panama', u'Papua New Guinea', u'Paraguay', u'Peru', u'Philippines', u'Poland', u'Portugal', u'Puerto Rico', u'Qatar', u'Romania', u'Russia', u'Rwanda', u'Saint Barthlemy', u'Saint Helena, Ascension and Tristan da Cunha', u'Saint Kitts and Nevis', u'Saint Lucia', u'Saint Martin', u'Saint Pierre and Miquelon', u'Saint Vincent and the Grenadines', u'Samoa', u'San Marino', u'Saudi Arabia', u'Senegal', u'Serbia', u'Seychelles', u'Sierra Leone', u'Singapore', u'Sint Maarten', u'Slovakia', u'Slovenia', u'So Tom and Prncipe', u'Solomon Islands', u'Somalia', u'South Africa', u'South Korea', u'Spain', u'Sri Lanka', u'Sudan', u'Suriname', u'Swaziland', u'Sweden', u'Switzerland', u'Syria', u'Taiwan', u'Tajikistan', u'Tanzania', u'Thailand', u'Timor-Leste', u'Togo', u'Tonga', u'Trinidad and Tobago', u'Tunisia', u'Turkey', u'Turkmenistan', u'Turks and Caicos Islands', u'Tuvalu', u'Uganda', u'Ukraine', u'United Arab Emirates', u'United Kingdom', u'United States', u'United States Virgin Islands', u'Uruguay', u'Uzbekistan', u'Vanuatu', u'Venezuela', u'Vietnam', u'Wallis and Futuna', u'West Bank', u'Western Sahara', u'Yemen', u'Zambia', u'Zimbabwe'], dtype=object)

Austria
1950    6.935
1955    6.947
1960    7.047
1965    7.271
1970    7.467
1975    7.579
1980    7.549
1985    7.560
1990    7.723
1995    8.047
2000    8.113
2005    8.185
2010    8.214
2015    8.224
2020    8.220
2025    8.190
2030    8.120
2035    8.009
2040    7.867
2045    7.702
2050    7.521
Name: Austria, dtype: float64

Plotting population of 4 countries



In [10]:

    
plotCountries = ['Austria', 'Germany', 'United States', 'France']
    
for country in plotCountries:
    row = df.ix[country]
    plt.plot(row.index, row, label=row.name ) 
    
plt.ylim(ymin=0) # start y axis at 0

plt.xticks(rotation=70)
plt.legend(loc='best')
plt.xlabel("Year")
plt.ylabel("# people (million)")
plt.title("Population of countries")









    Out[10]:





<matplotlib.text.Text at 0x10a733c10>

Plot 5 most populous countries from 2010 and 2060



In [11]:

    
def plot_populous(df, year):
    # sort table depending on data value in year column
    df_by_year = df.sort(year, ascending=False)
    
    plt.figure()
    for i in range(5):  
        row = df_by_year.ix[i]
        plt.plot(row.index, row, label=row.name ) 
            
    plt.ylim(ymin=0)
    
    plt.xticks(rotation=70)
    plt.legend(loc='best')
    plt.xlabel("Year")
    plt.ylabel("# people (million)")
    plt.title("Most populous countries in %d" % year)

plot_populous(df, 2010)
plot_populous(df, 2050)



In [ ]: