In [1]:
import bs4
In [2]:
# Load the watermark IPython extension and print a stamp for this notebook.
# NOTE(review): the flags presumably select author (-a), Python version (-v),
# date (-d) and a "last updated" label (-u) -- confirm against the watermark docs.
%load_ext watermark
%watermark -a 'Sebastian Raschka' -v -d -u
In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests
In [3]:
# Downloading and parsing the data into a Python dict.
# player_dict maps each player name to
# [name, position, team, vfm, value, points] (all still strings here).
player_dict = {}
url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')
# Note: html5lib deals better with broken html than lxml
name_list = []
for td in soup.find_all("td", { "class" : "tabName" }):
    # Keep only the text that follows the last 'Statistics' in the cell.
    name = td.text.split('Statistics')[-1].strip()
    if name:
        name_list.append(name)
        # The remaining values of the row are the tag siblings that follow
        # the name cell (whitespace/text nodes between tags are skipped).
        res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
        # Exactly five sibling cells are expected; anything else raises
        # ValueError here, which signals that the page layout changed.
        position, team, vfm, value, points = res
        value = value.strip('m')
        player_dict[name] = [name, position, team, vfm, value, points]
print('Found: %s' % len(name_list))
# Only preview the last name when something was found; indexing an empty
# list would raise IndexError.
if name_list:
    print(name_list[-1])
In [4]:
# Turn the scraped dict into a DataFrame (one row per player, indexed by
# the dict key) and convert the numeric columns from text.
df = pd.DataFrame.from_dict(player_dict, orient='index')
df.columns = ['name', 'position', 'team', 'vfm', 'value', 'pts']
for float_col in ('vfm', 'value'):
    df[float_col] = df[float_col].astype(float)
df['pts'] = df['pts'].astype(int)
df.tail()
Out[4]:
In [5]:
# Show pandas' summary statistics for df as a quick sanity check of the parsed values.
df.describe()
Out[5]:
In [6]:
# Add three text columns, pre-filled with empty strings, that a later cell
# fills in per player.
for text_col in ('status', 'description', 'returns'):
    df[text_col] = pd.Series('', index=df.index)
In [7]:
# Scrape the injuries-and-cards table and copy status / description /
# returns onto the matching rows of df (matched on the index, i.e. the name).
url = 'https://www.dreamteamfc.com/statistics/injuries-and-cards/ALL/'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')
name_list = []
for td in soup.find_all("td", { "class" : "tabName2" }):
    # Keep only the text that follows the last 'stats' in the cell.
    name = td.text.split('stats')[-1].strip()
    if name:
        name_list.append(name)
        # Tag siblings after the name cell carry the rest of the row.
        res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
        # Exactly five sibling cells are expected; position and team are
        # unpacked but not used below.
        position, team, status, description, returns = res
        # A boolean mask (rather than df.loc[name, ...]) means names that are
        # not in df are ignored instead of being appended as new rows.
        df.loc[df.index==name,['status', 'description', 'returns']] = status, description, returns
print('Found: %s' % len(name_list))
# Only preview the last name when something was found; indexing an empty
# list would raise IndexError.
if name_list:
    print(name_list[-1])
In [8]:
# Preview the last rows of df to check the status/description/returns columns.
df.tail()
Out[8]:
In [9]:
# Add the two form columns, pre-filled with 0, that a later cell fills in
# per player.
for points_col in ('month_pts', 'week_pts'):
    df[points_col] = pd.Series(0, index=df.index)
In [11]:
# Scrape the form guide and store each player's month / week points on the
# matching row of df (matched on the index, i.e. the name).
url = 'https://www.dreamteamfc.com/statistics/form-guide/all'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')
name_list = []
for td in soup.find_all("td", { "class" : "tabName" }):
    name = td.text.strip()
    if name:
        name_list.append(name)
        res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
        # The last two cells of the row are read as month and week points.
        # Only the parsing is guarded: rows whose last two cells are missing
        # or non-numeric are skipped, while errors raised by the DataFrame
        # assignment below are no longer swallowed.
        try:
            month_pts, week_pts = float(res[-2]), float(res[-1])
        except (ValueError, IndexError):
            continue
        df.loc[df.index==name, ['month_pts', 'week_pts']] = month_pts, week_pts
print('Found: %s' % len(name_list))
# Only preview the last name when something was found; indexing an empty
# list would raise IndexError.
if name_list:
    print(name_list[-1])
In [13]:
# Put the columns into their final order: identity, season figures,
# recent form, then availability information.
column_order = [
    'name', 'position', 'team',
    'vfm', 'value', 'pts',
    'month_pts', 'week_pts',
    'status', 'description', 'returns',
]
df = df[column_order]
df.tail()
Out[13]:
In [13]:
# Getting the current time stamp for the data.
# `date` ends up as a 'YYYYMMDD' string.
from datetime import datetime
url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
r.raise_for_status()
data = r.text
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed.
soup = BeautifulSoup(data, 'html5lib')
update_info = soup.find('li', {'class' : 'pointsupdateinfo' })
if update_info is None:
    raise ValueError('No "pointsupdateinfo" element found at %s' % url)
# The date is taken to be the last whitespace-separated token of the element.
# NOTE(review): assumes a day/month/year token separated by '/' -- confirm
# against the live page.
raw_date = update_info.text.split()[-1].strip()
# Parse with the separators intact: once the slashes are removed, unpadded
# dates such as 1/11/2015 and 11/1/2015 collapse to the same digit string.
d = datetime.strptime(raw_date, '%d/%m/%Y').date()
date = d.strftime('%Y%m%d')
print(date)
In [14]:
# Write df to a CSV whose file name carries the `date` stamp; index=False
# leaves the index out of the file.
df.to_csv('../data/dreamteamfc_%s.csv' % date, index=False)
In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests
In [6]:
# Downloading and parsing the data into a Python dict.
# team_dict maps each team name to [rank, <remaining row values as int>].
team_dict = {}
url = 'http://www.espnfc.com/barclays-premier-league/23/table'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')
# Note: html5lib deals better with broken html than lxml
for td in soup.find_all('td', { 'class' : 'pos' }):
    rank = int(td.text)
    # Tag siblings after the rank cell carry the rest of the row; cells that
    # hold only a non-breaking space are dropped.
    res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag) and i.text!='\xa0']
    team_name = res[0].strip()
    # Every value after the team name must parse as an integer.
    values = [int(i) for i in res[1:]]
    team_dict[team_name] = [rank] + values
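Column legend: Pos = position, P = played, W = won, D = drawn, L = lost, F = goals for, A = goals against, GD = goal difference, PTS = points.
Suffixes: `_ov` = overall, `_hm` = home, `_aw` = away.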
In [10]:
# Build the league-table frame: one row per team (indexed by team name),
# sorted by position, with the team name also available as the first column.
df = pd.DataFrame.from_dict(team_dict, orient='index')
cols = ['Pos','P_ov','W_ov','D_ov','L_ov','F_ov','A_ov',
        'W_hm','D_hm','L_hm','F_hm','A_hm', 'W_aw',
        'D_aw','L_aw','F_aw','A_aw','GD','PTS']
df.columns = cols
# DataFrame.sort() no longer exists in current pandas; sort_values() is the
# supported way to order rows by a column.
df = df.sort_values('Pos')
# Copy the index into a regular column so it survives to_csv(index=False).
df['team'] = df.index
df = df[['team']+cols]
df
Out[10]:
In [12]:
# Write df to a fixed, dated CSV path; index=False leaves the index out of the file.
df.to_csv('../data/2014_epl_day_17/espn_20141222.csv', index=False)
In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests
In [12]:
# Downloading and parsing the data into a Python dict.
# injury_dict maps 'Rest Of Name, FirstToken' to the first two cells that
# follow the name cell in the same row.
injury_dict = {}
url = 'http://365stats.com/football/injuries'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')
# Note: html5lib deals better with broken html than lxml
for td in soup.find_all('td', { 'nowrap' : 'nowrap' }):
    name = td.text.split()
    if not name:
        # An empty cell has no name to index by; name[0] would raise IndexError.
        continue
    # Reorder the tokens: everything after the first token, a comma, then
    # the first token.
    player_info = ['%s, %s' % (' '.join(name[1:]), name[0])]
    for i in td.next_siblings:
        if isinstance(i, bs4.Tag):
            player_info.append(i.text)
    injury_dict[player_info[0]] = player_info[1:3]
In [13]:
# Build the injury frame (one row per dict key) and expose the key as a
# trailing 'name' column as well as the index.
df = pd.DataFrame.from_dict(injury_dict, orient='index')
df.columns = ['injury', 'returns']
df.insert(df.shape[1], 'name', df.index)
df.tail()
Out[13]:
In [4]:
# Write df to a fixed, dated CSV path. No index argument is given, so the
# index is written with to_csv's default setting.
df.to_csv('../data/2014_epl_day_17/365stats_injury_20141222.csv')
In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests
In [27]:
# Downloading and parsing the fixtures into a two-column DataFrame
# ('home', 'away'), one row per fixture.
url = 'http://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')
# Note: html5lib deals better with broken html than lxml

# Find tab for the upcoming fixtures: start with tab 2 and move on to tab 3
# when tab 2 contains any 'ergebnis-link' anchors.
# NOTE(review): presumably those anchors mark results of matches already
# played -- confirm against the page.
tab = 'spieltagtabs-2'
div = soup.find('div', { 'id' : tab })
tit = div.find_all('a', { 'class' : 'ergebnis-link' })
if len(tit) > 0:
    tab = 'spieltagtabs-3'

# Get fixtures
home = []
away = []
div = soup.find('div', { 'id' : tab })
# Home and away teams sit in cells with different class strings; empty
# cells are skipped.
for t in div.find_all('td', { 'class' : 'text-right no-border-rechts no-border-links' }):
    team = t.text.strip()
    if team:
        home.append(team)
for t in div.find_all('td', { 'class' : 'no-border-links no-border-rechts' }):
    team = t.text.strip()
    if team:
        away.append(team)

# The two lists are paired by position; pandas raises if their lengths differ.
df = pd.DataFrame(home, columns=['home'])
df['away'] = away
df
Out[27]:
In [11]:
# Write df to a fixed, dated CSV path; index=False leaves the index out of the file.
df.to_csv('../data/2014_epl_day_19/transfermarkt_20141227.csv', index=False)
In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests
In [9]:
# Downloading the matchday page and printing each candidate fixture cell,
# split on ' v '.
url = 'http://www.premierleague.com/en-gb/matchday.html'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')
# Note: html5lib deals better with broken html than lxml
# TODO: home/away are initialised but never filled in this cell; the loop
# below only prints the split text.
home = []
away = []
for t in soup.find_all('td', { 'width' : '30%' }):
    team = t.text.strip().split(' v ')
    print(team)
In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests
In [16]:
# Scrape the Telegraph fantasy-football player table into a DataFrame with
# columns name, team, salary, pts/salary, week_pts, total_pts.
url = 'https://fantasyfootball.telegraph.co.uk/premierleague/players/'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')
# Note: html5lib deals better with broken html than lxml

player_dict = {}
for t in soup.find_all('td', { 'class' : 'first' }):
    player = t.text.strip()
    # Tag siblings after the name cell carry the rest of the row. A name that
    # occurs twice overwrites the earlier entry.
    player_dict[player] = [s.text for s in t.next_siblings if isinstance(s, bs4.Tag)]

# parse the player dictionary
df = pd.DataFrame.from_dict(player_dict, orient='index')

# make name column
df['name'] = df.index

# assign column names and reorder columns
# (exactly five scraped columns plus 'name' are expected here)
df.columns = ['team', 'salary', 'pts/salary', 'week_pts', 'total_pts', 'name']
df = df[['name', 'team', 'salary', 'pts/salary', 'week_pts', 'total_pts']]

# parse data into the right format
# strip a leading/trailing pound sign, then any surrounding spaces and 'm'
df['salary'] = df['salary'].apply(lambda x: x.strip('£').strip(' m'))
df[['salary', 'pts/salary']] = df[['salary', 'pts/salary']].astype(float)
df[['week_pts', 'total_pts']] = df[['week_pts', 'total_pts']].astype(float)

print(df.shape)
df.tail()
Out[16]:
In [38]:
# Scrape the Telegraph form guide and store the text of each player's
# 'sixth last' cell in a new '6week_pts' column (matched on the name column).
url = 'https://fantasyfootball.telegraph.co.uk/premierleague/formguide/'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')
# Note: html5lib deals better with broken html than lxml

# Players that do not appear in the form guide keep the default of 0.
df['6week_pts'] = pd.Series(0, index=df.index)
for t in soup.find_all('td', { 'class' : 'first' }):
    player = t.text.strip()
    if player:
        # Look the cell up within the same table row as the name cell.
        week6 = t.parent.find('td', { 'class' : 'sixth last' })
        # NOTE(review): the scraped value is stored as text in a column that
        # was initialised with the integer 0 -- confirm whether a numeric
        # conversion is wanted before relying on this column's dtype.
        df.loc[df['name'] == player, '6week_pts'] = week6.text
df.tail()
Out[38]:
In [39]:
# Write df to a fixed, dated CSV path; index=False leaves the index out of the file.
df.to_csv('../data/2014_epl_day_20/telegraph_20141229.csv', index=False)
In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests
In [25]:
# Scrape the form-guide table into a one-column DataFrame indexed by club,
# holding the text of each club's position cell.
url = 'http://m.premierleague.com/en-gb/form-guide.html'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')
# Note: html5lib deals better with broken html than lxml

#df['6week_pts'] = pd.Series(0, index=df.index)

team_dict = {}

for d in soup.find_all('td', { 'class' : 'col-pos' }):
    # Stop once more than 20 clubs have been collected.
    # NOTE(review): '> 20' admits a 21st entry, and a club that shows up again
    # overwrites its earlier position without growing the dict -- confirm
    # whether '>= 20' was intended.
    if len(team_dict) > 20:
        break
    pos = d.text
    # Walk the cells after the position cell until the club cell is found.
    for e in d.next_siblings:
        if isinstance(e, bs4.Tag):
            if 'class' in e.attrs and 'col-club' in e.attrs['class']:
                club = e.text
                team_dict[club] = pos
                break

df = pd.DataFrame.from_dict(team_dict, orient='index')
df.columns = ['position-last-6-games']
df.tail()
Out[25]:
In [26]:
# Write df to a fixed, dated CSV path. The club names exist only in the
# index of this frame, so the index has to be written (under a 'team'
# header); with index=False the file would hold positions with no clubs.
df.to_csv('../data/2014_epl_day_20/mpremierleague_20141230.csv', index_label='team')