In [1]:
import bs4

In [2]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -v -d -u


Sebastian Raschka 
Last updated: 03/01/2015 

CPython 3.4.2
IPython 2.3.1



Collecting Premier League and Daily Fantasy Soccer Data

Sections



dreamteamfc.com



Getting General Player Statistics


In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [3]:
# Fetch the player statistics page and collect one record per player

player_dict = {}

url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
r = requests.get(url)
# html5lib copes with malformed markup better than lxml does
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []

for td in soup.findAll("td", {"class": "tabName"}):
    # Keep only the text that follows the last 'Statistics' marker in the cell
    name = td.text.split('Statistics')[-1].strip()
    if not name:
        continue
    name_list.append(name)
    # Only element siblings carry data; bare text nodes between cells are ignored
    cells = [sib.text for sib in td.next_siblings
             if isinstance(sib, bs4.element.Tag)]
    # Exactly five data cells are expected; any other count raises ValueError here
    position, team, vfm, value, points = cells
    player_dict[name] = [name, position, team, vfm, value.strip('m'), points]

print('Found: %s' % len(name_list))
print(name_list[-1])


Found: 401
O'Brien, Joey

In [4]:
# Turn the scraped records into a DataFrame indexed by player name

column_names = ['name', 'position', 'team', 'vfm', 'value', 'pts']
df = pd.DataFrame.from_dict(player_dict, orient='index')
df.columns = column_names

# The scraped values are strings; convert the numeric columns one by one
for col, dtype in (('vfm', float), ('value', float), ('pts', int)):
    df[col] = df[col].astype(dtype)

df.tail()


Out[4]:
name position team vfm value pts
Sigurdsson, Gylfi Sigurdsson, Gylfi MID SWA 28.67 3.0 86
van Ginkel, Marco van Ginkel, Marco MID CHE 0.00 2.0 0
Chamakh, Marouane Chamakh, Marouane STR CRY 2.67 1.5 4
Ince, Tom Ince, Tom MID HUL 6.50 2.0 13
Ireland, Stephen Ireland, Stephen MID STO 4.00 1.0 4

In [5]:
# Summary statistics of the numeric columns
df.describe()


Out[5]:
vfm value pts
count 401.000000 401.000000 401.000000
mean 11.185661 2.770574 29.581047
std 10.259686 1.416327 27.582405
min -13.000000 1.000000 -13.000000
25% 3.600000 1.500000 9.000000
50% 9.330000 2.500000 24.000000
75% 15.850000 3.500000 43.000000
max 93.330000 7.500000 167.000000



Getting Injuries and Cards Information


In [6]:
# Add empty-string columns that the injuries-and-cards scrape fills in
for col in ('status', 'description', 'returns'):
    df[col] = pd.Series('', index=df.index)

In [7]:
url = 'https://www.dreamteamfc.com/statistics/injuries-and-cards/ALL/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []
info_cols = ['status', 'description', 'returns']

for td in soup.findAll("td", {"class": "tabName2"}):
    # Keep only the text that follows the last 'stats' marker in the cell
    name = td.text.split('stats')[-1].strip()
    if not name:
        continue
    name_list.append(name)
    cells = [sib.text for sib in td.next_siblings
             if isinstance(sib, bs4.element.Tag)]
    # Exactly five data cells are expected; any other count raises ValueError here
    position, team, status, description, returns = cells
    # Rows are matched on the index (player name); unmatched names change nothing
    df.loc[df.index == name, info_cols] = status, description, returns

print('Found: %s' % len(name_list))
print(name_list[-1])


Found: 81
Fernando

In [8]:
# Show the last rows, now including the status/description/returns columns
df.tail()


Out[8]:
name position team vfm value pts status description returns
Sigurdsson, Gylfi Sigurdsson, Gylfi MID SWA 28.67 3.0 86
van Ginkel, Marco van Ginkel, Marco MID CHE 0.00 2.0 0 Unavailable Joined AC Milan on season-long loan 25/05/2015
Chamakh, Marouane Chamakh, Marouane STR CRY 2.67 1.5 4 Injured Sustained in the encounter with Stoke on 13/12... 26/12/2014
Ince, Tom Ince, Tom MID HUL 6.50 2.0 13
Ireland, Stephen Ireland, Stephen MID STO 4.00 1.0 4 Doubtful Rated a doubt for 22/12 visit of Chelsea. 01/01/2015



Getting Player Form Information


In [9]:
# Add zero-filled columns that the form-guide scrape fills in
for col in ('month_pts', 'week_pts'):
    df[col] = pd.Series(0, index=df.index)

In [11]:
url = 'https://www.dreamteamfc.com/statistics/form-guide/all'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []
form_cols = ['month_pts', 'week_pts']

for td in soup.findAll("td", {"class": "tabName"}):
    name = td.text.strip()
    if not name:
        continue
    name_list.append(name)

    cells = [sib.text for sib in td.next_siblings
             if isinstance(sib, bs4.element.Tag)]
    try:
        # The last two data cells are read as month and week points
        month_pts, week_pts = float(cells[-2]), float(cells[-1])
        df.loc[df.index == name, form_cols] = month_pts, week_pts
    except ValueError:
        # A ValueError (e.g. a non-numeric cell) leaves this player's defaults untouched
        pass

print('Found: %s' % len(name_list))
print(name_list[-1])


Found: 401
O'Brien, Joey

In [13]:
# Put the points columns next to each other, ahead of the availability columns

column_order = ['name', 'position', 'team', 'vfm', 'value', 'pts',
                'month_pts', 'week_pts', 'status', 'description', 'returns']
df = df[column_order]

df.tail()


Out[13]:
name position team vfm value pts month_pts week_pts status description returns
Sigurdsson, Gylfi Sigurdsson, Gylfi MID SWA 28.67 3.0 86 28 5
van Ginkel, Marco van Ginkel, Marco MID CHE 0.00 2.0 0 0 0 Unavailable Joined AC Milan on season-long loan 25/05/2015
Chamakh, Marouane Chamakh, Marouane STR CRY 2.67 1.5 4 -1 0 Injured Sustained in the encounter with Stoke on 13/12... 26/12/2014
Ince, Tom Ince, Tom MID HUL 6.50 2.0 13 0 0
Ireland, Stephen Ireland, Stephen MID STO 4.00 1.0 4 -1 0 Doubtful Rated a doubt for 22/12 visit of Chelsea. 01/01/2015



Saving the Data to CSV


In [13]:
# Getting the current time stamp for the data

from datetime import datetime

url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
r = requests.get(url)
data = r.text
# Name the parser explicitly, as the other cells do; without it BeautifulSoup
# picks whichever parser is installed and emits a warning.
soup = BeautifulSoup(data, 'html5lib')

update_info = soup.find('li', {'class': 'pointsupdateinfo'})
if update_info is None:
    # Fail with a clear message instead of an AttributeError on None
    raise ValueError("No 'pointsupdateinfo' element found at %s" % url)

# The last whitespace-separated token is the date, written day/month/year.
# Parsing with the slashes in place also accepts non-zero-padded days/months.
raw_date = update_info.text.split()[-1]
d = datetime.strptime(raw_date, '%d/%m/%Y').date()
date = d.strftime('%Y%m%d')
print(date)


20141220

In [14]:
# Save the player table under a file name that carries the data's update date;
# the index is not written.
df.to_csv(path_or_buf='../data/dreamteamfc_%s.csv' % date, index=False)



espnfc.com



Getting Team Ranks and Stats


In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [6]:
# Fetch the league table and collect one list of integers per team

team_dict = {}

url = 'http://www.espnfc.com/barclays-premier-league/23/table'
r = requests.get(url)
# html5lib copes with malformed markup better than lxml does
soup = BeautifulSoup(r.text, 'html5lib')

for td in soup.findAll('td', {'class': 'pos'}):
    rank = int(td.text)
    # Gather the element siblings, dropping cells that hold only a non-breaking space
    cells = []
    for sib in td.next_siblings:
        if isinstance(sib, bs4.element.Tag) and sib.text != '\xa0':
            cells.append(sib.text)
    # First remaining cell is the team name; every cell after it must parse as an int
    team_name = cells[0].strip()
    team_dict[team_name] = [rank] + [int(cell) for cell in cells[1:]]

Column legend:

  • Pos: POSITION
  • P: GAMES PLAYED
  • W: WINS
  • D: DRAWS
  • L: LOSSES
  • F: GOALS FOR
  • A: GOALS AGAINST
  • GD: GOAL DIFFERENCE
  • PTS: POINTS

Suffixes:

  • _ov: OVERALL
  • _hm: HOME GAMES
  • _aw: AWAY GAMES

In [10]:
# Build the league table DataFrame: one row per team, ordered by league position
df = pd.DataFrame.from_dict(team_dict, orient='index')
cols = ['Pos','P_ov','W_ov','D_ov','L_ov','F_ov','A_ov',
            'W_hm','D_hm','L_hm','F_hm','A_hm', 'W_aw',
            'D_aw','L_aw','F_aw','A_aw','GD','PTS']
df.columns = cols
# DataFrame.sort() has been removed from pandas; sort_values() is its replacement
df = df.sort_values('Pos')
# Copy the team names out of the index so they survive to_csv(index=False)
df['team'] = df.index
df = df[['team']+cols]
df


Out[10]:
team Pos P_ov W_ov D_ov L_ov F_ov A_ov W_hm D_hm L_hm F_hm A_hm W_aw D_aw L_aw F_aw A_aw GD PTS
Chelsea Chelsea 1 18 14 3 1 40 13 9 0 0 22 3 5 3 1 18 10 27 45
Manchester City Manchester City 2 18 13 3 2 39 15 6 1 1 15 5 7 2 1 24 10 24 42
Manchester United Manchester United 3 18 10 5 3 33 19 8 1 1 22 7 2 4 2 11 12 14 35
Southampton Southampton 4 18 10 2 6 31 14 6 1 2 21 6 4 1 4 10 8 17 32
West Ham United West Ham United 5 18 9 4 5 29 21 6 1 2 14 7 3 3 3 15 14 8 31
Arsenal Arsenal 6 18 8 6 4 32 22 5 3 1 18 10 3 3 3 14 12 10 30
Tottenham Hotspur Tottenham Hotspur 7 18 9 3 6 24 24 4 1 4 11 10 5 2 2 13 14 0 30
Swansea City Swansea City 8 18 8 4 6 23 19 6 2 2 15 7 2 2 4 8 12 4 28
Liverpool Liverpool 9 18 7 4 7 22 24 3 4 2 9 8 4 0 5 13 16 -2 25
Newcastle United Newcastle United 10 18 6 5 7 19 26 4 2 2 10 9 2 3 5 9 17 -7 23
Stoke City Stoke City 11 18 6 4 8 19 23 3 1 4 9 11 3 3 4 10 12 -4 22
Everton Everton 12 18 5 6 7 27 28 3 3 3 16 15 2 3 4 11 13 -1 21
Aston Villa Aston Villa 13 18 5 5 8 11 22 2 3 3 7 11 3 2 5 4 11 -11 20
Sunderland Sunderland 14 18 3 10 5 16 27 1 6 3 10 15 2 4 2 6 12 -11 19
West Bromwich Albion West Bromwich Albion 15 18 4 5 9 18 26 2 3 5 13 16 2 2 4 5 10 -8 17
Queens Park Rangers Queens Park Rangers 16 18 5 2 11 21 34 5 2 2 17 12 0 0 9 4 22 -13 17
Hull City Hull City 17 18 3 7 8 18 25 1 3 4 8 11 2 4 4 10 14 -7 16
Crystal Palace Crystal Palace 18 18 3 6 9 20 30 2 2 5 10 14 1 4 4 10 16 -10 15
Burnley Burnley 19 18 3 6 9 12 27 2 4 4 7 12 1 2 5 5 15 -15 15
Leicester City Leicester City 20 18 2 4 12 16 31 1 4 4 12 15 1 0 8 4 16 -15 10



Saving ESPN Data to CSV


In [12]:
# Save the league table; the index is not written.
df.to_csv(path_or_buf='../data/2014_epl_day_17/espn_20141222.csv', index=False)



365stats.com



Getting Injury Data


In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [12]:
# Fetch the injury list and collect two detail cells per player

injury_dict = {}

url = 'http://365stats.com/football/injuries'
r = requests.get(url)
# html5lib copes with malformed markup better than lxml does
soup = BeautifulSoup(r.text, 'html5lib')

for td in soup.findAll('td', {'nowrap': 'nowrap'}):
    parts = td.text.split()
    # Move the first word to the end: "a b c" becomes "b c, a"
    key = '%s, %s' % (' '.join(parts[1:]), parts[0])
    details = [sib.text for sib in td.next_siblings if isinstance(sib, bs4.Tag)]
    # Only the first two sibling cells are kept
    injury_dict[key] = details[:2]

In [13]:
# One row per player; the dict key becomes both the index and a 'name' column
injuries = pd.DataFrame.from_dict(injury_dict, orient='index')
injuries.columns = ['injury', 'returns']
df = injuries.assign(name=injuries.index)
df.tail()


Out[13]:
injury returns name
bellerin, h Ankle/Foot Injury no date bellerin, h
smalling, c Groin/Pelvis Injury 1 week smalling, c
dzeko, e Calf/Shin Injury 3 Weeks dzeko, e
sigurdsson, g Ankle/Foot Injury no date sigurdsson, g
obertan, g Hip/Thigh Injury no date obertan, g



Saving 365stats Data to CSV


In [4]:
# Save the injury table. The index is written as well (to_csv default).
# NOTE(review): the index holds the same values as the 'name' column, and the
# other save cells pass index=False -- confirm whether the duplicate is intended.
df.to_csv(path_or_buf='../data/2014_epl_day_17/365stats_injury_20141222.csv')



Transfermarkt.com



Getting Home and Away Teams


In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [27]:
# Downloading the league overview page and extracting home/away team lists

url = 'http://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1'
r = requests.get(url)
# html5lib copes with malformed markup better than lxml does
soup = BeautifulSoup(r.text, 'html5lib')

# Choose the tab to read: start with 'spieltagtabs-2' and switch to
# 'spieltagtabs-3' when the former contains any 'ergebnis-link' anchors
# (presumably meaning its matches already have results -- TODO confirm).
tab = 'spieltagtabs-2'
div = soup.find('div', {'id': tab})
tit = div.findAll('a', {'class': 'ergebnis-link'})
if tit:
    tab = 'spieltagtabs-3'

div = soup.find('div', {'id': tab})

# Home and away names sit in cells that differ only in their class string
home_class = 'text-right no-border-rechts no-border-links'
away_class = 'no-border-links no-border-rechts'

# Keep the stripped text of every matching cell, skipping empty ones
home = [text
        for text in (t.text.strip() for t in div.findAll('td', {'class': home_class}))
        if text]
away = [text
        for text in (t.text.strip() for t in div.findAll('td', {'class': away_class}))
        if text]

# Pair the lists by position; assigning 'away' fails if the lengths differ
df = pd.DataFrame(home, columns=['home'])
df['away'] = away
df


Out[27]:
home away
0 Stoke City Manchester Utd.
1 Aston Villa Crystal Palace
2 Hull City Everton
3 Liverpool FC Leicester City
4 Manchester City Sunderland AFC
5 Newcastle Utd. Burnley FC
6 QPR Swansea City
7 Southampton FC Arsenal FC
8 West Ham Utd. West Brom
9 Spurs Chelsea FC



Saving Home and Away Teams to CSV


In [11]:
# Save the home/away pairings; the index is not written.
df.to_csv(path_or_buf='../data/2014_epl_day_19/transfermarkt_20141227.csv', index=False)



premierleague.com


In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [9]:
# Downloading the matchday page and printing each fixture as a [home, away] list

url = 'http://www.premierleague.com/en-gb/matchday.html'
r = requests.get(url)
# html5lib copes with malformed markup better than lxml does
soup = BeautifulSoup(r.text, 'html5lib')

# Initialised here but not filled by this cell
home = []
away = []

# Fixture cells are selected by their width attribute; text is split on ' v '
fixture_cells = soup.findAll('td', {'width': '30%'})
for cell in fixture_cells:
    print(cell.text.strip().split(' v '))


['Spurs', 'Man Utd']
['Southampton', 'Chelsea']
['Aston Villa', 'Sunderland']
['Hull', 'Leicester']
['Man City', 'Burnley']
['QPR', 'Crystal Palace']
['Stoke', 'West Brom']
['West Ham', 'Arsenal']
['Newcastle', 'Everton']
['Leicester', 'Newcastle']
['Tranmere', 'Swansea']
['West Brom', 'Gateshead']
['Liverpool', 'Chelsea']
['Paris SG', 'Chelsea']
['BSC Young Boys', 'Everton']
['Liverpool', 'Besiktas']
['Spurs', 'Fiorentina']



telegraph.co.uk



Getting Current Week Points


In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [16]:
url = 'https://fantasyfootball.telegraph.co.uk/premierleague/players/'
r = requests.get(url)
# html5lib copes with malformed markup better than lxml does
soup = BeautifulSoup(r.text, 'html5lib')

# Map each player name to the text of the element cells that follow it
player_dict = {}
for t in soup.findAll('td', {'class': 'first'}):
    player = t.text.strip()
    player_dict[player] = [s.text for s in t.next_siblings
                           if isinstance(s, bs4.Tag)]

# One row per player, indexed by name
df = pd.DataFrame.from_dict(player_dict, orient='index')

# Copy the index into a regular column
df['name'] = df.index

# Label the scraped columns (the name column was appended last), then put name first
data_cols = ['team', 'salary', 'pts/salary', 'week_pts', 'total_pts']
df.columns = data_cols + ['name']
df = df[['name'] + data_cols]

# Strip the currency sign and the trailing ' m' unit before converting to numbers
df['salary'] = df['salary'].apply(lambda x: x.strip('£').strip(' m'))
for col in ('salary', 'pts/salary', 'week_pts', 'total_pts'):
    df[col] = df[col].astype(float)

print(df.shape)
df.tail()


(548, 6)
Out[16]:
name team salary pts/salary week_pts total_pts
Moreno, A Moreno, A Liverpool 3.8 8.7 0 33
Grealish, J Grealish, J Aston Villa 2.5 2.8 2 7
Ferguson, S Ferguson, S Newcastle United 2.8 0.0 0 0
Coleman, S Coleman, S Everton 4.1 11.5 5 47
Chamakh, M Chamakh, M Crystal Palace 3.8 8.2 0 31



Getting 6-Week Points


In [38]:
url = 'https://fantasyfootball.telegraph.co.uk/premierleague/formguide/'
r  = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib') 
# Note: html5lib deals better with broken html than lxml

# Float default: the column stays numeric once parsed values are written into it
# (the other points columns of this frame are floats as well).
df['6week_pts'] = pd.Series(0.0, index=df.index)

for t in soup.findAll('td', { 'class' : 'first' }):
    player = t.text.strip()
    if not player:
        continue
    week6 = t.parent.find('td', { 'class' : 'sixth last' })
    if week6 is None:
        # Row without a 'sixth last' cell: keep the default
        continue
    try:
        # Store a number rather than the raw cell text, which would turn the
        # column into a mix of strings and numbers
        points = float(week6.text)
    except ValueError:
        # Non-numeric cell: keep the default
        continue
    df.loc[df['name'] == player, '6week_pts'] = points

df.tail()


Out[38]:
name team salary pts/salary week_pts total_pts 6week_pts
Moreno, A Moreno, A Liverpool 3.8 8.7 0 33 10
Grealish, J Grealish, J Aston Villa 2.5 2.8 2 7 4
Ferguson, S Ferguson, S Newcastle United 2.8 0.0 0 0 0
Coleman, S Coleman, S Everton 4.1 11.5 5 47 13
Chamakh, M Chamakh, M Crystal Palace 3.8 8.2 0 31 8



Saving telegraph.co.uk to CSV


In [39]:
# Save the telegraph player table; the index is not written.
df.to_csv(path_or_buf='../data/2014_epl_day_20/telegraph_20141229.csv', index=False)



m.premierleague.com



Combined Form of Previous 6 Games


In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [25]:
url = 'http://m.premierleague.com/en-gb/form-guide.html'
r = requests.get(url)
# html5lib copes with malformed markup better than lxml does
soup = BeautifulSoup(r.text, 'html5lib')

# Map club name -> position text taken from the form table
team_dict = {}

for d in soup.findAll('td', {'class': 'col-pos'}):
    # Stop once more than 20 clubs have been collected.
    # NOTE(review): with '> 20' a 21st position cell is still processed (and a
    # repeated club name would overwrite its earlier entry) -- confirm whether
    # '>= 20' was intended.
    if len(team_dict) > 20:
        break
    # The first following element whose class list contains 'col-club' names the club
    club_cell = next(
        (e for e in d.next_siblings
         if isinstance(e, bs4.Tag) and 'col-club' in e.attrs.get('class', [])),
        None)
    if club_cell is not None:
        team_dict[club_cell.text] = d.text

# Club names become the index; the single column holds the position text
df = pd.DataFrame.from_dict(team_dict, orient='index')
df.columns = ['position-last-6-games']
df.tail()


Out[25]:
position-last-6-games
Chelsea 6
Burnley 13
Leicester 17
Everton 19
Man Utd 4



Saving m.premierleague.com to CSV


In [26]:
# Write the index as a 'team' column: the club names exist only in the index,
# so index=False would save the positions without saying which club each belongs to.
df.to_csv('../data/2014_epl_day_20/mpremierleague_20141230.csv', index_label='team')