In [1]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -v
In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests
In [3]:
# Downloading and parsing the player-statistics table into a Python dict.
# One entry per player: [name, position, team, vfm, value, points].
player_dict = {}

url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')
# Note: html5lib deals better with broken html than lxml

name_list = []
for td in soup.find_all("td", {"class": "tabName"}):  # find_all: modern bs4 name for findAll
    # Cell text looks like "...Statistics<player name>"; keep the name part.
    name = td.text.split('Statistics')[-1].strip()
    if name:
        name_list.append(name)
        # The sibling <td> cells hold the remaining stats columns, in order.
        res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
        position, team, vfm, value, points = res
        value = value.strip('m')  # e.g. '5.2m' -> '5.2'
        # NOTE(review): keyed by display name, so two players sharing a name
        # would overwrite each other — verify names are unique on this site.
        player_dict[name] = [name, position, team, vfm, value, points]

print('Found: %s' % len(name_list))
print(name_list[-1])
In [4]:
# Reading the scraped records into a pandas DataFrame (one row per player,
# indexed by player name) and fixing the dtypes of the numeric columns,
# which arrive from the scrape as strings.
columns = ['name', 'position', 'team', 'vfm', 'value', 'points']
df = pd.DataFrame.from_dict(player_dict, orient='index')
df.columns = columns
df = df.astype({'vfm': float, 'value': float, 'points': int})
df.tail()
Out[4]:
In [5]:
df.describe()
Out[5]:
In [6]:
# Add empty placeholder columns for the injury/card information,
# which is filled in from a second page below.
for col in ('status', 'description', 'returns'):
    df[col] = pd.Series('', index=df.index)
In [7]:
# Scraping the injuries-and-cards table and merging status/description/returns
# into the existing DataFrame, matched on the player-name index.
url = 'https://www.dreamteamfc.com/statistics/injuries-and-cards/ALL/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []
for td in soup.find_all("td", {"class": "tabName2"}):  # find_all: modern bs4 name for findAll
    # Cell text looks like "...stats<player name>"; keep the name part.
    name = td.text.split('stats')[-1].strip()
    if name:
        name_list.append(name)
        # The sibling <td> cells hold the remaining columns, in order.
        res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
        position, team, status, description, returns = res
        # Match on the index (player name) set by the first scrape; players
        # not present on this page keep the empty-string defaults.
        df.loc[df.index == name, ['status', 'description', 'returns']] = status, description, returns

print('Found: %s' % len(name_list))
print(name_list[-1])
In [8]:
df.tail()
Out[8]:
In [9]:
# Add zero-valued placeholder columns for recent form,
# which is filled in from the form-guide page below.
for col in ('month_points', 'week_points'):
    df[col] = pd.Series(0, index=df.index)
In [10]:
# Scraping the form-guide page for each player's month/week points and
# merging them into the DataFrame, matched on the player-name index.
url = 'https://www.dreamteamfc.com/statistics/form-guide/all'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []
for td in soup.find_all("td", {"class": "tabName"}):  # find_all: modern bs4 name for findAll
    name = td.text.strip()
    if name:
        name_list.append(name)
        res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
        # The last two columns are the month and week points. Some rows hold
        # non-numeric placeholders; those players keep the 0 defaults.
        try:
            month_pts, week_pts = float(res[-2]), float(res[-1])
        except ValueError:
            continue
        df.loc[df.index == name, ['month_points', 'week_points']] = month_pts, week_pts

print('Found: %s' % len(name_list))
print(name_list[-1])
In [12]:
# Reordering the columns into their final layout.
df = df[['name', 'position', 'team', 'vfm', 'value', 'points', 'month_points',
         'week_points', 'status', 'description', 'returns']]

# "Normalizing" player names to lowercase. The vectorized .str accessor
# replaces the Python-level apply/lambda and also tolerates missing values.
df['name'] = df['name'].str.lower()
df.tail()
Out[12]:
In [13]:
# Getting the current time stamp of the data from the points-update notice
# on the statistics page, reformatted as YYYYMMDD for the output filename.
from datetime import datetime

url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
r = requests.get(url)
# Parser made explicit (html5lib), consistent with the other cells; the
# original bare BeautifulSoup(data) call guesses a parser and warns.
soup = BeautifulSoup(r.text, 'html5lib')

raw_date = soup.find('li', {'class': 'pointsupdateinfo'}).text
# e.g. "... 21/08/2015" -> "21082015" -> date -> "20150821"
raw_date = raw_date.split()[-1].replace('/', '').strip()
d = datetime.strptime(raw_date, '%d%m%Y').date()
date = d.strftime('%Y%m%d')
print(date)
In [14]:
df.to_csv('../data/dreamteamfc_%s.csv' % date, index=False)