In [1]:
from pandas import *
import pandas as pd
import numpy as np
import re
import plotly.plotly as py
import pandas as pd
from pandas import DataFrame, Series
import plotly
plotly.offline.init_notebook_mode()
from bs4 import BeautifulSoup
import urllib2
from pylab import *
In [3]:
def removeAnnotations(frame, col):
    """Return column `col` of `frame` with bracketed numeric markers removed.

    Every run of one or more consecutive markers of the form ``[<digits>]``
    (for example ``[1]`` or ``[2][13]``) is deleted from each string value.

    Parameters
    ----------
    frame : DataFrame
        Frame holding the column to clean. It is not modified.
    col : str
        Name of the column whose string values are cleaned.

    Returns
    -------
    Series
        A new Series with the cleaned values; missing values are passed
        through unchanged.
    """
    # Raw string: '\[' is an invalid escape sequence in a normal string
    # literal and triggers a warning on recent Python 3 releases.
    marker = re.compile(r'(\[[0-9]+\])+')
    # na_action='ignore' skips NaN cells, which would otherwise raise a
    # TypeError inside the regex substitution.
    return frame[col].map(lambda x: marker.sub('', x), na_action='ignore')
# Scrape the Wikipedia table of traffic-related death rates into `df`, with
# one row per table row: the linked country name, the link target (used
# later as a join key) and the text of the second cell.
page = urllib2.urlopen('https://en.wikipedia.org/wiki/List_of_countries_by_traffic-related_death_rate')
try:
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so the resulting tree (and therefore
    # the CSS selector below) can differ between machines.
    soup = BeautifulSoup(page, 'html.parser')
finally:
    # Release the HTTP connection even if parsing fails.
    page.close()

# The first two matched rows are dropped -- presumably header rows;
# TODO confirm against the current page markup.
htmlRows = soup.select('#mw-content-text > table.wikitable > tr')[2:]
tableCells = [tag.find_all('td') for tag in htmlRows]

df = DataFrame({
    # Text and href of the first link in the first cell of each row.
    'Country': [rowCells[0].a.text for rowCells in tableCells],
    'CountryId': [rowCells[0].a['href'] for rowCells in tableCells],
    # Raw text of the second cell; still contains footnote markers here.
    'Fatals': [rowCells[1].text for rowCells in tableCells],
})

# Strip "[n]" footnote markers from the scraped values.
df.Fatals = removeAnnotations(df, 'Fatals')
# Drop any characters that cannot be represented in ASCII from the names.
df.Country = df.Country.map(lambda x: x.encode('ascii', 'ignore'))
In [4]:
# Scrape the ISO 3166-1 alpha-3 page into `countryDf`, mapping each link
# target found in the first content table ('CountryId') to a code ('Code'),
# then save the mapping to countries.csv.
page = urllib2.urlopen('https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3')
try:
    # Explicit parser so the parse tree does not depend on which optional
    # parsers are installed.
    isoSoup = BeautifulSoup(page, 'html.parser')
finally:
    # Release the HTTP connection even if parsing fails.
    page.close()

codeTable = isoSoup.select('#mw-content-text > table')[0]
links = codeTable.find_all('a')
# The code is read from the first <span> under each link's grandparent
# element -- presumably the entry that pairs the code with the country
# link; verify against the page markup.
countryCodes = [link.parent.parent.span.text.encode('ascii', 'ignore') for link in links]
countryHrefs = [link.attrs['href'] for link in links]
countryDf = DataFrame({'CountryId': countryHrefs, 'Code': countryCodes})

# (old href, new href) pairs applied to CountryId, so that link targets on
# this page match the ones used as join keys elsewhere in the notebook.
adjustments = [
    ('/wiki/United_States_of_America', '/wiki/United_States'),
]
for old, new in adjustments:
    # Assign the result back to the column instead of calling
    # replace(..., inplace=True) on the attribute-accessed Series: the
    # in-place form mutates an intermediate object and does not update the
    # frame under pandas Copy-on-Write.
    countryDf['CountryId'] = countryDf['CountryId'].replace(old, new)

countryDf.to_csv('countries.csv')
In [7]:
# Attach a code to every scraped row by joining on the link target. A left
# join keeps rows of `df` that have no match in `countryDf`; their Code is
# left missing.
dfWithCodes = pd.merge(df, countryDf, on='CountryId', how='left')

# Manual fix-up: row label 123 is given the code 'PSE' by hand.
# NOTE(review): this depends on the row order of the scraped table at the
# time the notebook was written -- confirm that row 123 is still the
# intended entry before relying on the output.
if 123 in dfWithCodes.index:
    # Use .loc on the frame rather than dfWithCodes.Code[123] = ...: the
    # chained form assigns into an intermediate Series, which raises
    # SettingWithCopyWarning and may not update the frame at all.
    dfWithCodes.loc[123, 'Code'] = 'PSE'

dfWithCodes.to_csv('fatalities.csv', index=False)