In [1]:
# Print the author, the run date, and the Python/IPython versions for this notebook
%load_ext watermark
%watermark -a 'Sebastian Raschka' -v -d


Sebastian Raschka 03/01/2015 

CPython 3.4.2
IPython 2.3.1



Collecting Premier League Data

Sections



dreamteamfc.com



Getting General Player Statistics


In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [3]:
# Scrape the dreamteamfc.com player table into `player_dict`, keyed by
# player name: name -> [name, position, team, vfm, value, points]

player_dict = {}

url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []

for td in soup.findAll("td", {"class": "tabName"}):
    # keep only what follows the last 'Statistics' in the name cell
    name = td.text.split('Statistics')[-1].strip()
    if not name:
        continue
    name_list.append(name)
    # text of every tag that follows the name cell in the same row;
    # non-tag nodes between the cells are ignored
    res = []
    for sibling in td.next_siblings:
        if isinstance(sibling, bs4.element.Tag):
            res.append(sibling.text)
    position, team, vfm, value, points = res
    value = value.strip('m')  # drop leading/trailing 'm' characters
    player_dict[name] = [name, position, team, vfm, value, points]

print('Found: %s' % len(name_list))
print(name_list[-1])


Found: 401
O'Brien, Joey

In [4]:
# Build a DataFrame with one row per scraped player (indexed by player name)

df = pd.DataFrame.from_dict(player_dict, orient='index')
df.columns = ['name', 'position', 'team', 'vfm', 'value', 'pts']

# The scraped cells are text, so cast the numeric columns one by one
for col in ('vfm', 'value'):
    df[col] = df[col].astype(float)
df['pts'] = df['pts'].astype(int)

df.tail()


Out[4]:
name position team vfm value pts
Sigurdsson, Gylfi Sigurdsson, Gylfi MID SWA 28.67 3.0 86
van Ginkel, Marco van Ginkel, Marco MID CHE 0.00 2.0 0
Chamakh, Marouane Chamakh, Marouane STR CRY 2.67 1.5 4
Ince, Tom Ince, Tom MID HUL 6.50 2.0 13
Ireland, Stephen Ireland, Stephen MID STO 4.00 1.0 4

In [5]:
# Summary statistics of the numeric columns (vfm, value, pts)
df.describe()


Out[5]:
vfm value pts
count 401.000000 401.000000 401.000000
mean 11.185661 2.770574 29.581047
std 10.259686 1.416327 27.582405
min -13.000000 1.000000 -13.000000
25% 3.600000 1.500000 9.000000
50% 9.330000 2.500000 24.000000
75% 15.850000 3.500000 43.000000
max 93.330000 7.500000 167.000000



Getting Injuries and Cards Information


In [6]:
# Add empty-string placeholder columns that the injuries-and-cards scrape fills in
for col in ('status', 'description', 'returns'):
    df[col] = pd.Series('', index=df.index)

In [7]:
# Fill status / description / returns for every player listed on the
# injuries-and-cards page. Rows are matched on the DataFrame index (the
# player name); a name that is not in the index updates nothing.
url = 'https://www.dreamteamfc.com/statistics/injuries-and-cards/ALL/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []
injury_columns = ['status', 'description', 'returns']

for td in soup.findAll("td", {"class": "tabName2"}):
    # keep only what follows the last 'stats' in the name cell
    name = td.text.split('stats')[-1].strip()
    if not name:
        continue
    name_list.append(name)
    res = []
    for sibling in td.next_siblings:
        if isinstance(sibling, bs4.element.Tag):
            res.append(sibling.text)
    # position and team are not used below; unpacking all five values
    # still requires the row to have exactly five following cells
    position, team, status, description, returns = res
    df.loc[df.index == name, injury_columns] = status, description, returns

print('Found: %s' % len(name_list))
print(name_list[-1])


Found: 81
Fernando

In [8]:
# Show the last rows, now including the status / description / returns columns
df.tail()


Out[8]:
name position team vfm value pts status description returns
Sigurdsson, Gylfi Sigurdsson, Gylfi MID SWA 28.67 3.0 86
van Ginkel, Marco van Ginkel, Marco MID CHE 0.00 2.0 0 Unavailable Joined AC Milan on season-long loan 25/05/2015
Chamakh, Marouane Chamakh, Marouane STR CRY 2.67 1.5 4 Injured Sustained in the encounter with Stoke on 13/12... 26/12/2014
Ince, Tom Ince, Tom MID HUL 6.50 2.0 13
Ireland, Stephen Ireland, Stephen MID STO 4.00 1.0 4 Doubtful Rated a doubt for 22/12 visit of Chelsea. 01/01/2015



Getting Player Form Information


In [9]:
# Add zero-initialised columns that the form-guide scrape fills in
for col in ('month_pts', 'week_pts'):
    df[col] = pd.Series(0, index=df.index)

In [11]:
# Fill month_pts / week_pts from the form-guide page. The last two cells of
# each player row are read as the monthly and weekly points; rows are matched
# on the DataFrame index (the player name).
url = 'https://www.dreamteamfc.com/statistics/form-guide/all'
r  = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []
skipped = []  # players whose last two cells could not be read as numbers

for td in soup.findAll("td", { "class" : "tabName" }):
    name = td.text.strip()
    if not name:
        continue
    name_list.append(name)

    res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
    # Guard only the text-to-number conversion, so that a ValueError raised
    # by the DataFrame assignment itself is not swallowed as well.
    try:
        month_pts, week_pts = float(res[-2]), float(res[-1])
    except ValueError:
        # keep the 0 defaults for this player, but remember the name
        skipped.append(name)
        continue
    df.loc[df.index==name, ['month_pts', 'week_pts']] = month_pts, week_pts

print('Found: %s' % len(name_list))
print(name_list[-1])
if skipped:
    # make the previously silent skips visible
    print('Skipped (non-numeric form values): %s' % len(skipped))


Found: 401
O'Brien, Joey

In [13]:
# Move the points columns in front of the injury/card columns

column_order = [
    'name', 'position', 'team',
    'vfm', 'value', 'pts', 'month_pts', 'week_pts',
    'status', 'description', 'returns',
]
df = df[column_order]

df.tail()


Out[13]:
name position team vfm value pts month_pts week_pts status description returns
Sigurdsson, Gylfi Sigurdsson, Gylfi MID SWA 28.67 3.0 86 28 5
van Ginkel, Marco van Ginkel, Marco MID CHE 0.00 2.0 0 0 0 Unavailable Joined AC Milan on season-long loan 25/05/2015
Chamakh, Marouane Chamakh, Marouane STR CRY 2.67 1.5 4 -1 0 Injured Sustained in the encounter with Stoke on 13/12... 26/12/2014
Ince, Tom Ince, Tom MID HUL 6.50 2.0 13 0 0
Ireland, Stephen Ireland, Stephen MID STO 4.00 1.0 4 -1 0 Doubtful Rated a doubt for 22/12 visit of Chelsea. 01/01/2015



Saving the Data to CSV


In [13]:
# Getting the time stamp of the data (formatted YYYYMMDD) from the
# 'pointsupdateinfo' list item on the player statistics page

from datetime import datetime

url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
r  = requests.get(url)
data = r.text
# Name the parser explicitly, as every other cell in this notebook does;
# without it BeautifulSoup picks whichever parser happens to be installed.
soup = BeautifulSoup(data, 'html5lib')

raw_date = soup.find('li', {'class' : 'pointsupdateinfo' }).text
# The last whitespace-separated token is the date in day/month/year form.
# Parsing it with the slashes in place keeps the fields unambiguous even if
# day or month are not zero-padded (stripping the slashes first would not).
raw_date = raw_date.split()[-1].strip()
d = datetime.strptime(raw_date, '%d/%m/%Y').date()
date = d.strftime('%Y%m%d')
print(date)


20141220

In [14]:
# Write the player table to CSV (without the index), with the data's
# time stamp in the file name
csv_path = '../data/dreamteamfc_%s.csv' % date
df.to_csv(csv_path, index=False)



espnfc.com



Getting Team Ranks and Stats


In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [13]:
# Scrape the ESPN FC league table into `team_dict`:
# team name -> [rank, followed by the integer values of the row's other cells]

team_dict = {}

url = 'http://www.espnfc.com/barclays-premier-league/23/table'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

for td in soup.findAll('td', {'class': 'pos'}):
    rank = int(td.text)
    # Collect the text of the cells after the position cell, ignoring
    # non-tag nodes and cells whose text is just a non-breaking space
    res = []
    for sibling in td.next_siblings:
        if isinstance(sibling, bs4.element.Tag) and sibling.text != '\xa0':
            res.append(sibling.text)
    team_name = res[0].strip()
    values = [int(cell_text) for cell_text in res[1:]]
    team_dict[team_name] = [rank] + values

Column legend:

  • Pos: POSITION
  • P: GAMES PLAYED
  • W: WINS
  • D: DRAWS
  • L: LOSSES
  • F: GOALS FOR
  • A: GOALS AGAINST
  • GD: GOAL DIFFERENCE
  • PTS: POINTS

Suffixes:

  • _ov: OVERALL
  • _hm: HOME GAMES
  • _aw: AWAY GAMES

In [14]:
# Turn the scraped league table into a DataFrame ordered by league position,
# with the team name as the first column.
df = pd.DataFrame.from_dict(team_dict, orient='index')
cols = ['Pos','P_ov','W_ov','D_ov','L_ov','F_ov','A_ov',
            'W_hm','D_hm','L_hm','F_hm','A_hm', 'W_aw',
            'D_aw','L_aw','F_aw','A_aw','GD','PTS']
df.columns = cols
# DataFrame.sort() has been removed from pandas; sort_values() is its
# replacement for sorting by a column.
df = df.sort_values('Pos')
df['team'] = df.index
df = df[['team']+cols]
df


Out[14]:
team Pos P_ov W_ov D_ov L_ov F_ov A_ov W_hm D_hm L_hm F_hm A_hm W_aw D_aw L_aw F_aw A_aw GD PTS
Chelsea Chelsea 1 20 14 4 2 44 19 9 0 0 22 3 5 4 2 22 16 25 46
Manchester City Manchester City 2 20 14 4 2 44 19 7 2 1 20 9 7 2 1 24 10 25 46
Manchester United Manchester United 3 20 10 7 3 34 20 8 1 1 22 7 2 6 2 12 13 14 37
Southampton Southampton 4 20 11 3 6 34 15 7 2 2 24 7 4 1 4 10 8 19 36
Tottenham Hotspur Tottenham Hotspur 5 20 10 4 6 29 27 5 2 4 16 13 5 2 2 13 14 2 34
Arsenal Arsenal 6 20 9 6 5 34 25 5 3 1 18 10 4 3 4 16 15 9 33
West Ham United West Ham United 7 20 9 5 6 31 24 6 2 3 16 10 3 3 3 15 14 7 32
Liverpool Liverpool 8 20 8 5 7 28 27 4 5 2 15 11 4 0 5 13 16 1 29
Swansea City Swansea City 9 20 8 5 7 25 24 6 2 2 15 7 2 3 5 10 17 1 29
Newcastle United Newcastle United 10 20 7 6 7 25 31 5 3 2 16 14 2 3 5 9 17 -6 27
Stoke City Stoke City 11 20 7 5 8 22 24 4 2 4 12 12 3 3 4 10 12 -2 26
Aston Villa Aston Villa 12 20 5 7 8 11 22 2 5 3 7 11 3 2 5 4 11 -11 22
Everton Everton 13 20 5 6 9 29 33 3 3 3 16 15 2 3 6 13 18 -4 21
Sunderland Sunderland 14 20 3 11 6 18 30 1 6 3 10 15 2 5 3 8 15 -12 20
Hull City Hull City 15 20 4 7 9 20 26 2 3 5 10 12 2 4 4 10 14 -6 19
Queens Park Rangers Queens Park Rangers 16 20 5 4 11 22 35 5 4 2 18 13 0 0 9 4 22 -13 19
West Bromwich Albion West Bromwich Albion 17 20 4 6 10 19 29 2 3 5 13 16 2 3 5 6 13 -10 18
Crystal Palace Crystal Palace 18 20 3 8 9 20 30 2 2 5 10 14 1 6 4 10 16 -10 17
Burnley Burnley 19 20 3 8 9 17 32 2 4 4 7 12 1 4 5 10 20 -15 17
Leicester City Leicester City 20 20 3 5 12 19 33 1 4 4 12 15 2 1 8 7 18 -14 14



Saving ESPN Data to CSV


In [12]:
# Write the league table to CSV without the index
espn_csv = '../data/2014_epl_day_17/espn_20141222.csv'
df.to_csv(espn_csv, index=False)



Getting Top Scorer


In [10]:
# Scrape the ESPN FC top-scorer list into `player_dict`
# (player name -> [team, goals]) and build a DataFrame sorted by goals.

player_dict = {}

url = 'http://www.espnfc.com/barclays-premier-league/23/statistics/scorers'

r  = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib') 
# Note: html5lib deals better with broken html than lxml

for td in soup.findAll('td', { 'headers' : 'player' }):
    name = td.text
    # The unpacking requires exactly two following cells (team, goals) once
    # non-tag nodes and non-breaking-space-only cells are filtered out
    team, goals = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag) and i.text!='\xa0']
    player_dict[name] = [team, int(goals)]
    
df_essc = pd.DataFrame.from_dict(player_dict, orient='index')
df_essc['name'] = df_essc.index
df_essc.columns = ['team', 'goals', 'name']
df_essc = df_essc[['name', 'team', 'goals']]
# DataFrame.sort() has been removed from pandas; sort_values() replaces it.
# Reassigning instead of inplace=True also avoids mutating a column selection.
df_essc = df_essc.sort_values('goals', ascending=False)
df_essc.head()


Out[10]:
name team goals
Diego Costa Diego Costa Chelsea 17
Sergio Agüero Sergio Agüero Manchester City 14
Charlie Austin Charlie Austin Queens Park Rangers 13
Alexis Sánchez Alexis Sánchez Arsenal 12
Papiss Demba Cisse Papiss Demba Cisse Newcastle United 9



Getting Top Assists


In [8]:
# Scrape the ESPN FC top-assists list into `player_dict`
# (player name -> [team, assists]) and build a DataFrame sorted by assists.
player_dict = {}

url = 'http://www.espnfc.com/barclays-premier-league/23/statistics/assists'

r  = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib') 
# Note: html5lib deals better with broken html than lxml

for td in soup.findAll('td', { 'headers' : 'player' }):
    name = td.text
    # The unpacking requires exactly two following cells (team, assists) once
    # non-tag nodes and non-breaking-space-only cells are filtered out
    team, assists = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag) and i.text!='\xa0']
    player_dict[name] = [team, int(assists)]
    
df_esas = pd.DataFrame.from_dict(player_dict, orient='index')
df_esas['name'] = df_esas.index
df_esas.columns = ['team', 'assists', 'name']
df_esas = df_esas[['name', 'team', 'assists']]
# DataFrame.sort() has been removed from pandas; sort_values() replaces it.
# Reassigning instead of inplace=True also avoids mutating a column selection.
df_esas = df_esas.sort_values('assists', ascending=False)
df_esas.head()


Out[8]:
name team assists
Cesc Fàbregas Cesc Fàbregas Chelsea 15
Gylfi Sigurdsson Gylfi Sigurdsson Swansea City 8
Leighton Baines Leighton Baines Everton 8
Stewart Downing Stewart Downing West Ham United 7
Dusan Tadic Dusan Tadic Southampton 7



365stats.com



Getting Injury Data


In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [2]:
# Scrape the 365stats.com injury table into `injury_dict`, keyed by a
# reordered player name; each value holds the text of the first two cells
# that follow the name cell.

injury_dict = {}

url = 'http://365stats.com/football/injuries'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

for td in soup.findAll('td', {'nowrap': 'nowrap'}):
    name = td.text.split()
    # move the first token behind the rest: "<first> <rest...>" -> "<rest...>, <first>"
    key = '%s, %s' % (' '.join(name[1:]), name[0])
    following_cells = [sibling.text for sibling in td.next_siblings
                       if isinstance(sibling, bs4.Tag)]
    player_info = [key] + following_cells
    injury_dict[player_info[0]] = player_info[1:3]

In [3]:
# One row per injured player; the dict key (the player name) becomes the
# index and is also placed in a leading 'name' column
df = pd.DataFrame.from_dict(injury_dict, orient='index')
df.columns = ['injury', 'returns']
df.insert(0, 'name', df.index)
df.tail()


Out[3]:
name injury returns
Osman, L Osman, L Calf/Shin Injury no date
Hibbert, T Hibbert, T Muscular Injury 2 Weeks
Moses, V Moses, V Thigh Muscle Strain 2 Weeks
Taarabt, A Taarabt, A Groin/Pelvis Injury no date
Ward, S Ward, S Ankle/Foot Injury no date



Saving 365stats Data to CSV


In [4]:
# Write the injury table to CSV.
# NOTE(review): unlike the other exports in this notebook, index=False is not
# passed here, so the index (the player name) is written as an additional
# first column next to 'name' - confirm that this is intended.
injury_csv = '../data/2014_epl_day_17/365stats_injury_20141222.csv'
df.to_csv(injury_csv)



Transfermarkt.com



Getting Home and Away Teams


In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [2]:
# Downloading the transfermarkt.com Premier League page and collecting the
# home and away teams of the upcoming fixtures into a DataFrame

url = 'http://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1'

# The request is sent through a session with a browser-like User-Agent and
# an explicit Host header
s = requests.Session()
s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'
s.headers['Host'] = 'www.transfermarkt.com'
r = s.get(url)

soup = BeautifulSoup(r.text, 'html5lib') 
# Note: html5lib deals better with broken html than lxml

# Find tab for the upcoming fixtures: start with 'spieltagtabs-2' and move on
# to 'spieltagtabs-3' if the former already contains 'ergebnis-link' anchors
# (presumably links to match results, i.e. that matchday has been played -
# verify against the page markup)
tab = 'spieltagtabs-2'
div = soup.find('div', { 'id' : tab })
tit = div.findAll('a', { 'class' : 'ergebnis-link' })
if len(tit) > 0:
    tab = 'spieltagtabs-3'

# Get fixtures: non-empty cells with the first class string are collected as
# home teams, those with the second class string as away teams
home = []
away = []

div = soup.find('div', { 'id' : tab })
for t in div.findAll('td', { 'class' : 'text-right no-border-rechts no-border-links' }):
    team = t.text.strip()
    if team:
        home.append(team)
for t in div.findAll('td', { 'class' : 'no-border-links no-border-rechts' }):
    team = t.text.strip()
    if team:
        away.append(team)


df = pd.DataFrame(home, columns=['home'])
df['away'] = away
df


Out[2]:
home away
0 Norwich Manchester City
1 Manchester Utd. Crystal Palace
2 Bournemouth Swansea
3 Stoke City Southampton FC
4 West Ham Watford
5 Sunderland Everton
6 Arsenal FC West Brom
7 Aston Villa Spurs
8 Liverpool Chelsea FC
9 Leicester City Newcastle



Saving Home and Away Teams to CSV


In [11]:
# Write the home/away fixture table to CSV without the index
fixtures_csv = '../data/2014_epl_day_19/transfermarkt_20141227.csv'
df.to_csv(fixtures_csv, index=False)



premierleague.com


In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [9]:
# Download the premierleague.com matchday page and print each fixture cell
# as a list obtained by splitting its text on ' v '

url = 'http://www.premierleague.com/en-gb/matchday.html'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

# NOTE(review): home and away are initialised here but never filled in this cell
home = []
away = []

fixture_cells = soup.findAll('td', {'width': '30%'})
for t in fixture_cells:
    team = t.text.strip().split(' v ')
    print(team)


['Spurs', 'Man Utd']
['Southampton', 'Chelsea']
['Aston Villa', 'Sunderland']
['Hull', 'Leicester']
['Man City', 'Burnley']
['QPR', 'Crystal Palace']
['Stoke', 'West Brom']
['West Ham', 'Arsenal']
['Newcastle', 'Everton']
['Leicester', 'Newcastle']
['Tranmere', 'Swansea']
['West Brom', 'Gateshead']
['Liverpool', 'Chelsea']
['Paris SG', 'Chelsea']
['BSC Young Boys', 'Everton']
['Liverpool', 'Besiktas']
['Spurs', 'Fiorentina']



telegraph.co.uk



Getting Current Week Points


In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [15]:
# Scrape the Telegraph fantasy football player list into a DataFrame with
# one row per player (indexed by player name)
url = 'https://fantasyfootball.telegraph.co.uk/premierleague/players/'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

# player name -> text of every tag that follows the name cell in its row
player_dict = {}
for t in soup.findAll('td', {'class': 'first'}):
    player = t.text.strip()
    player_dict[player] = [sibling.text for sibling in t.next_siblings
                           if isinstance(sibling, bs4.Tag)]

df = pd.DataFrame.from_dict(player_dict, orient='index')

# the index (player name) becomes a regular column, appended last ...
df['name'] = df.index

# ... then all columns are named and 'name' is moved to the front
df.columns = ['team', 'salary', 'pts/salary', 'week_pts', 'total_pts', 'name']
df = df[['name', 'team', 'salary', 'pts/salary', 'week_pts', 'total_pts']]


def _salary_text(raw):
    """Strip '£' characters, then spaces and 'm' characters, from both ends."""
    return raw.strip('£').strip(' m')


# convert the text cells to numbers
df['salary'] = df['salary'].apply(_salary_text)
for col in ('salary', 'pts/salary'):
    df[col] = df[col].astype(float)
for col in ('week_pts', 'total_pts'):
    df[col] = df[col].astype(int)

print(df.shape)
df.tail()


(548, 6)
Out[15]:
name team salary pts/salary week_pts total_pts
Januzaj, A Januzaj, A Manchester United 3.9 3.8 1 15
Grealish, J Grealish, J Aston Villa 2.5 4.0 3 10
Anichebe, V Anichebe, V West Bromwich Albion 4.0 8.2 10 33
Hibbert, T Hibbert, T Everton 2.2 3.2 0 7
Coutinho, P Coutinho, P Liverpool 4.4 11.1 2 49



Getting 6-Week Points


In [16]:
# Add a '6week_pts' column from the Telegraph form guide. For every player
# row, the cell with class 'sixth last' is read and stored for the row(s) of
# `df` whose 'name' equals the player name.
url = 'https://fantasyfootball.telegraph.co.uk/premierleague/formguide/'
r  = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib') 
# Note: html5lib deals better with broken html than lxml

df['6week_pts'] = pd.Series(0, index=df.index)

unparsed = []  # players whose 6-week cell does not hold an integer

for t in soup.findAll('td', { 'class' : 'first' }):
    player = t.text.strip()
    if player:
        week6 = t.parent.find('td', { 'class' : 'sixth last' })
        # Store the points as an integer: the column is initialised with
        # integer 0, and writing the raw cell text into it would leave a
        # mix of strings and integers.
        try:
            week6_pts = int(week6.text)
        except ValueError:
            # keep the default of 0 for this player, but remember the name
            unparsed.append(player)
            continue
        df.loc[df['name'] == player, '6week_pts'] = week6_pts

if unparsed:
    print('Non-numeric 6-week points (left at 0): %s' % len(unparsed))

df.tail()


Out[16]:
name team salary pts/salary week_pts total_pts 6week_pts
Januzaj, A Januzaj, A Manchester United 3.9 3.8 1 15 2
Grealish, J Grealish, J Aston Villa 2.5 4.0 3 10 7
Anichebe, V Anichebe, V West Bromwich Albion 4.0 8.2 10 33 16
Hibbert, T Hibbert, T Everton 2.2 3.2 0 7 4
Coutinho, P Coutinho, P Liverpool 4.4 11.1 2 49 27



Saving telegraph.co.uk to CSV


In [39]:
# Write the Telegraph player table to CSV without the index
telegraph_csv = '../data/2014_epl_day_20/telegraph_20141229.csv'
df.to_csv(telegraph_csv, index=False)



m.premierleague.com



Combined Form of Previous 6 Games


In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [3]:
# Scrape the form-guide table from m.premierleague.com into a DataFrame that
# maps each club to the text of its position cell.
url = 'http://m.premierleague.com/en-gb/form-guide.html'
r  = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib') 
# Note: html5lib deals better with broken html than lxml

max_teams = 20  # number of clubs to collect before stopping

team_dict = {}

for d in soup.findAll('td', { 'class' : 'col-pos' }):
    # Stop as soon as `max_teams` clubs have been collected. The previous
    # check (`> 20`) was off by one: if the rows after the first 20 repeat
    # clubs that are already stored (presumably further tables on the page -
    # verify), the dict never grows past 20, the check never fires, and the
    # stored positions are overwritten by the later rows.
    if len(team_dict) >= max_teams:
        break
    pos = d.text
    # the club name is in the first following cell whose classes include 'col-club'
    for e in d.next_siblings:
        if isinstance(e, bs4.Tag):
            if 'class' in e.attrs and 'col-club' in e.attrs['class']:
                club = e.text
                team_dict[club] = pos
                break

df = pd.DataFrame.from_dict(team_dict, orient='index')
        
df.columns = ['position-last-6-games']
df['team'] = df.index
df.tail()


Out[3]:
position-last-6-games team
West Brom 15 West Brom
Hull 12 Hull
Southampton 9 Southampton
Newcastle 11 Newcastle
Crystal Palace 18 Crystal Palace



Saving m.premierleague.com to CSV


In [26]:
# Write the form table to CSV without the index
form_csv = '../data/2014_epl_day_20/mpremierleague_20141230.csv'
df.to_csv(form_csv, index=False)



fantasyfootballscout.co.uk



Predicted Line-Ups


In [31]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [85]:
# Scrape the team-news page into a DataFrame with one column per team,
# holding the player names listed for that team
url = 'http://www.fantasyfootballscout.co.uk/team-news/'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

# team name (text of an <h2> inside an <li>) -> text of every
# 'player-name' span found inside that same <li>
team_dict = {}

for li in soup.findAll('li'):
    for h2 in li.findAll('h2'):
        team = h2.text
        player_spans = li.findAll('span', {'class': 'player-name'})
        team_dict[team] = [span.text for span in player_spans]

df = pd.DataFrame.from_dict(team_dict)
df.tail()


Out[85]:
Arsenal Aston Villa Burnley Chelsea Crystal Palace Everton Hull City Leicester City Liverpool Manchester City Manchester United Newcastle United Queens Park Rangers Southampton Stoke City Sunderland Swansea City Tottenham Hotspur West Bromwich Albion West Ham United
6 Flamini Sánchez Jones Matic McArthur Barry Meyler Mahrez Lucas Fernandinho Carrick Tioté Henry Wanyama Whelan Johnson Ki Bentaleb Gardner Nolan
7 Coquelin Cleverley Marney Willian Joe Ledley Barkley Livermore Drinkwater Moreno Y Touré Rooney Ayoze Pérez Barton Davis Mame Biram Diouf Larsson Dyer Lamela Morrison Noble
8 Oxlade-Chamberlain N'Zogbia Boyd Oscar Bolasie Naismith Brady James Coutinho Nasri Young Sissoko Fer Mane Walters Gómez Sigurdsson Eriksen Brunt Downing
9 Sánchez Benteke Barnes Hazard Zaha Mirallas Aluko Vardy Sterling Silva Falcao Gouffran Zamora Tadic Arnautovic Wickham Routledge Chadli Dorrans Sakho
10 Cazorla Agbonlahor Ings Diego Costa Campbell Lukaku Jelavic Ulloa Lallana Jovetic van Persie Armstrong Austin Pellè Crouch Fletcher Bony Kane Berahino Carroll



Saving fantasyfootballscout.co.uk to CSV


In [86]:
# Write the predicted line-ups to CSV without the index
lineups_csv = '../data/epl_1314_21/fantasyfootballscout.csv'
df.to_csv(lineups_csv, index=False)