In [ ]:
from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('http://www.bbc.com/sport/olympics/rio-2016/medals/sports').read()
soup = BeautifulSoup(r,"lxml")
sports_span = soup.findAll("span",{"class","medals-table-by-sport__sport-name"})
sports_names = []
sports_names_format = []
for s in sports_span:
sports_names_format.append(str(s))
sports_names.append(str(s).lower().replace(" ","-")[48:-7])
print sports_names
In [ ]:
# Save html for each sport
htmls = {}
for s in sports_names:
htmls[s] = urllib.urlopen('http://www.bbc.com/sport/olympics/rio-2016/medals/sports/'+s+'#'+s).read()
In [ ]:
# Find table html for each sport
thtmls = {}
for s in sports_names:
soupsp = BeautifulSoup(htmls[s],"lxml")
thtmls[s] = soupsp.findAll("table",{"class","medals-table-by-sport__countries_table"})
In [ ]:
# For every sport, scrape medal data
import re
medal_names = ['gold','silver','bronze']
medals = {}
sports_countries = {}
all_countries_format = []
for s in sports_names:
print s
medals[s] = {}
h = str(thtmls[s])
if not thtmls[s]:
print 'no medals yet'
else:
# Find countries of interest
pattern = r"<abbr class=\"abbr-on medium-abbr-off\" title=\""
pmatch = re.finditer(pattern, h)
countries = []
for i,match in enumerate(pmatch):
country = h[int(match.end()):int(match.end())+200].rsplit('"')[0]
all_countries_format.append(country)
countries.append(country.lower().replace(" ","-"))
sports_countries[s] = countries
for c in sports_countries[s]:
if c == 'great-britain-&-n.-ireland':
ci1 = 'great-britain-and-northern-ireland'
medals[s][c] = {}
for m in medal_names:
pattern = r"<abbr class=\"abbr-on medium-abbr-off\" title=\".{,800}" + m + ".{,150}" + ci1 + "\">"
gendermatch = re.finditer(pattern, h)
for i,match in enumerate(gendermatch):
medals[s][c][m] = int(h[int(match.end()):int(match.end())+3])
else:
ci = c
medals[s][ci] = {}
for m in medal_names:
pattern = r"<abbr class=\"abbr-on medium-abbr-off\" title=\".{,500}" + m + ".{,150}" + ci + "\">"
gendermatch = re.finditer(pattern, h)
for i,match in enumerate(gendermatch):
medals[s][ci][m] = int(h[int(match.end()):int(match.end())+3])
print medals[s]
In [ ]:
import numpy as np
all_countries_format = list(np.unique(all_countries_format))
all_countries_format.remove('Great Britain & N. Ireland')
all_countries_format.append('Great Britain')
all_countries_format_list = list(np.unique(all_countries_format))
In [ ]:
import pandas as pd
# Create an empty dataframe
columns = ['country','sport','medal','N']
df = pd.DataFrame(columns=columns)
# Identify all countries with at least 1 medal
countries_list = list(set(reduce(lambda x,y: x+y,sports_countries.values())))
countries_list = sorted(countries_list)
# Fill dataframe
for s in sports_names:
if thtmls[s]:
for i,c in enumerate(countries_list):
ci = all_countries_format_list[i]
for m in medal_names:
if c in sports_countries[s]:
rowtemp = [ci, s, m, medals[s][c][m]]
else:
rowtemp = [ci, s, m, 0]
dftemp = pd.DataFrame([rowtemp], columns=columns)
df =df.append(dftemp)
In [ ]:
df.to_csv('now_medals.csv')