In [1]:
from pandas import *
import pandas as pd
import numpy as np
import re
import plotly.plotly as py
import pandas as pd
from pandas import DataFrame, Series
import plotly
plotly.offline.init_notebook_mode()
from bs4 import BeautifulSoup
import urllib2
from pylab import *



In [3]:
def removeAnnotations(frame, col):
    """Return column `col` of `frame` with bracketed numeric markers removed.

    Every run of one or more consecutive markers such as ``[12]`` or
    ``[3][4]`` is deleted from each value; all other text is left untouched.

    Parameters
    ----------
    frame : DataFrame
        Frame holding the column to clean. It is not modified.
    col : str
        Name of the column to clean. Every value must be a string, because
        each one is passed to a regex substitution.

    Returns
    -------
    Series
        A new Series with the same index as ``frame[col]`` and the markers
        stripped from every value.
    """
    # Raw string: '\[' inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about. Compiling once also avoids
    # re-resolving the pattern for every row.
    marker = re.compile(r'(\[[0-9]+\])+')
    return frame[col].map(lambda x: marker.sub('', x))

# Download and parse the Wikipedia list of traffic-related death rates.
page = urllib2.urlopen('https://en.wikipedia.org/wiki/List_of_countries_by_traffic-related_death_rate')
try:
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the parsed tree can
    # differ from machine to machine.
    soup = BeautifulSoup(page, 'html.parser')
finally:
    # Always release the HTTP connection, even if parsing fails.
    page.close()

# The first two rows are skipped (presumably header rows; verify against the
# live page if its layout changes).
htmlRows = soup.select('#mw-content-text > table.wikitable > tr')[2:]
# Fail loudly instead of silently building an empty frame when the selector
# no longer matches the page.
assert htmlRows, 'no table rows matched the selector; the page layout may have changed'
tableCells = [tag.find_all('td') for tag in htmlRows]

# One record per table row: the first cell's link gives the country name and
# its href (used later as the join key); the second cell holds the raw text
# of the fatality figure.
df = DataFrame({
    'Country': [rowCells[0].a.text for rowCells in tableCells],
    'CountryId': [rowCells[0].a['href'] for rowCells in tableCells],
    'Fatals': [rowCells[1].text for rowCells in tableCells],
})

# Strip bracketed numeric markers such as "[12]" from the figures.
df['Fatals'] = removeAnnotations(df, 'Fatals')
# Drop any characters that cannot be represented in ASCII from the names.
df['Country'] = df['Country'].map(lambda x: x.encode('ascii', 'ignore'))

In [4]:
# Download and parse the Wikipedia page listing ISO 3166-1 alpha-3 codes.
page = urllib2.urlopen('https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3')
try:
    # Name the parser explicitly so the parsed tree does not depend on which
    # optional parsers are installed.
    isoSoup = BeautifulSoup(page, 'html.parser')
finally:
    # Always release the HTTP connection, even if parsing fails.
    page.close()

# Only the first table directly under the content container is used.
codeTable = isoSoup.select('#mw-content-text > table')[0]

# For every link in that table, the code is read from the first <span> found
# under the link's grandparent element, and the link's href becomes the key.
# NOTE(review): this relies on the page's current markup; confirm if the
# layout changes.
links = codeTable.find_all('a')
countryCodes = [link.parent.parent.span.text.encode('ascii', 'ignore') for link in links]
countryHrefs = [link.attrs['href'] for link in links]

countryDf = DataFrame({'CountryId': countryHrefs, 'Code': countryCodes})

# (href on the ISO page, href to use instead) pairs, so that CountryId values
# line up with the hrefs collected from the fatalities page.
adjustments = [
    ('/wiki/United_States_of_America', '/wiki/United_States'),
]

for old, new in adjustments:
    # Assign the result back to the column. Calling replace(..., inplace=True)
    # on `countryDf.CountryId` mutates through a chained access, which may
    # act on a temporary and leave the frame unchanged.
    countryDf['CountryId'] = countryDf['CountryId'].replace(old, new)

countryDf.to_csv('countries.csv')

In [7]:
# Attach the ISO code to every fatalities row. A left join keeps all rows of
# `df`; rows whose CountryId has no match get a missing Code.
dfWithCodes = pd.merge(df, countryDf, on='CountryId', how='left')

# Manual fix-up: row 123 is given the code 'PSE' by hand.
# NOTE(review): the hard-coded row label depends on the scraped row order;
# confirm it still points at the intended country when the page changes.
# The assert stops `.loc` from silently appending a new row if the label is
# missing.
assert 123 in dfWithCodes.index, 'row 123 is missing; the manual PSE fix-up needs updating'
# A single .loc assignment instead of chained indexing (`frame.Code[123] = ...`),
# which can write to a temporary copy and leave the frame unchanged.
dfWithCodes.loc[123, 'Code'] = 'PSE'

dfWithCodes.to_csv('fatalities.csv', index=False)