In [1]:
# Load the watermark extension, then print the author name (-a) together with
# the Python and IPython versions (-v) this notebook was executed with.
%load_ext watermark
%watermark -a 'Sebastian Raschka' -v


Sebastian Raschka 

CPython 3.4.2
IPython 2.3.1

Parsing data from dreamteamfc.com

Sections



Getting General Player Statistics


In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

In [3]:
# Downloading and parsing the data into a Python dict
#
# Result: player_dict maps a player name to
# [name, position, team, vfm, value, points] (all still strings at this point).

player_dict = {}

url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
r = requests.get(url)
# Fail here on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')
# Note: html5lib deals better with broken html than lxml

name_list = []

for td in soup.find_all("td", {"class": "tabName"}):
    # Keep only what follows the 'Statistics' marker inside the name cell.
    name = td.text.split('Statistics')[-1].strip()
    if not name:
        continue
    name_list.append(name)
    # The sibling tags after the name cell hold the remaining columns;
    # non-tag siblings (e.g. whitespace strings) are skipped.
    res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
    position, team, vfm, value, points = res
    # Drop the 'm' characters around the value so it can be cast to float later.
    value = value.strip('m')
    player_dict[name] = [name, position, team, vfm, value, points]

if not name_list:
    # Without this guard the cell would end in an opaque IndexError below.
    raise ValueError('No player rows found at %s' % url)

print('Found: %s' % len(name_list))
print(name_list[-1])


Found: 401
O'Brien, Joey

In [4]:
# Reading the data into a pandas DataFrame
# (one row per player; the index is the player name used as dict key)

column_names = ['name', 'position', 'team', 'vfm', 'value', 'points']

df = pd.DataFrame.from_dict(player_dict, orient='index')
df.columns = column_names

# The scraped values are strings; cast the numeric columns in one step.
df = df.astype({'vfm': float, 'value': float, 'points': int})
df.tail()


Out[4]:
name position team vfm value points
Odemwingie, Peter Odemwingie, Peter STR STO 1.20 2.5 3
Cisse, Papiss Cisse, Papiss STR NEW 17.67 3.0 53
Duff, Michael Duff, Michael DEF BUR 18.00 1.0 18
Speroni, Julian Speroni, Julian GK CRY 10.67 1.5 16
Flamini, Mathieu Flamini, Mathieu MID ARS 14.00 1.5 21

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()


Out[5]:
vfm value points
count 401.000000 401.000000 401.000000
mean 10.083416 2.770574 26.705736
std 9.518409 1.416327 25.338867
min -12.000000 1.000000 -12.000000
25% 2.670000 1.500000 7.000000
50% 8.500000 2.500000 21.000000
75% 14.000000 3.500000 37.000000
max 88.000000 7.500000 155.000000



Getting Injuries and Cards Information


In [6]:
# Add empty-string columns that the injuries-and-cards scrape fills in afterwards
for column in ('status', 'description', 'returns'):
    df[column] = pd.Series('', index=df.index)

In [7]:
# Download the injuries-and-cards table and copy status, description and
# expected return date into the matching player rows of df.

url = 'https://www.dreamteamfc.com/statistics/injuries-and-cards/ALL/'
r = requests.get(url)
# Fail here on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []

for td in soup.find_all("td", {"class": "tabName2"}):
    # Keep only what follows the 'stats' marker inside the name cell.
    name = td.text.split('stats')[-1].strip()
    if not name:
        continue
    name_list.append(name)
    res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
    # Position and team are unpacked only to validate the row width;
    # df already carries them from the general statistics.
    _position, _team, status, description, returns = res
    # Boolean mask instead of df.loc[name]: a name missing from df is a no-op
    # rather than a new row.
    df.loc[df.index == name, ['status', 'description', 'returns']] = status, description, returns

print('Found: %s' % len(name_list))
# An empty list is legitimate here (nobody injured or carded), so guard the lookup.
if name_list:
    print(name_list[-1])


Found: 84
Tadic, Dusan

In [8]:
# Show the last rows; status/description/returns stay empty for players
# that have no entry in the injuries-and-cards table
df.tail()


Out[8]:
name position team vfm value points status description returns
Odemwingie, Peter Odemwingie, Peter STR STO 1.20 2.5 3 Injured Forced off during 30/8 game against Man City. ... Unknown
Cisse, Papiss Cisse, Papiss STR NEW 17.67 3.0 53 Doubtful Missed the Capital One Cup tie with Tottenham ... 21/12/2014
Duff, Michael Duff, Michael DEF BUR 18.00 1.0 18
Speroni, Julian Speroni, Julian GK CRY 10.67 1.5 16
Flamini, Mathieu Flamini, Mathieu MID ARS 14.00 1.5 21



Getting Player Form Information


In [9]:
# Form columns start at 0 and are overwritten by the form-guide scrape
for column in ('month_points', 'week_points'):
    df[column] = pd.Series(0, index=df.index)

In [10]:
# Download the form guide and store each player's points for the last month
# and the last week in df.

url = 'https://www.dreamteamfc.com/statistics/form-guide/all'
r = requests.get(url)
# Fail here on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []

for td in soup.find_all("td", {"class": "tabName"}):
    name = td.text.strip()
    if not name:
        continue
    name_list.append(name)

    res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
    try:
        # The last two cells of the row are the month and week points.
        month_pts, week_pts = float(res[-2]), float(res[-1])
        df.loc[df.index == name, ['month_points', 'week_points']] = month_pts, week_pts
    except ValueError:
        # Best effort: rows whose form cells are not numeric keep the default 0.
        pass

if not name_list:
    # Without this guard the cell would end in an opaque IndexError below.
    raise ValueError('No player rows found at %s' % url)

print('Found: %s' % len(name_list))
print(name_list[-1])


Found: 401
O'Brien, Joey

In [12]:
# Reordering the columns

column_order = ['name', 'position', 'team', 'vfm', 'value', 'points', 'month_points',
                'week_points', 'status', 'description', 'returns']

# .copy() makes the reordered frame independent of the original, so the column
# assignment below cannot trigger pandas' SettingWithCopyWarning.
df = df[column_order].copy()

# "Normalizing" player names: lower-case the 'name' column
# (the index keeps the original spelling).
df['name'] = df['name'].str.lower()

df.tail()


Out[12]:
name position team vfm value points month_points week_points status description returns
Odemwingie, Peter odemwingie, peter STR STO 1.20 2.5 3 0 0 Injured Forced off during 30/8 game against Man City. ... Unknown
Cisse, Papiss cisse, papiss STR NEW 17.67 3.0 53 23 0 Doubtful Missed the Capital One Cup tie with Tottenham ... 21/12/2014
Duff, Michael duff, michael DEF BUR 18.00 1.0 18 0 0
Speroni, Julian speroni, julian GK CRY 10.67 1.5 16 6 -2
Flamini, Mathieu flamini, mathieu MID ARS 14.00 1.5 21 4 0



Saving the Data to CSV


In [13]:
# Getting the current time stamp for the data
#
# Result: `date` is the site's "points updated" date as a YYYYMMDD string.

from datetime import datetime

url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
r = requests.get(url)
# Fail here on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
data = r.text
# Name the parser explicitly (same one as the other cells): without it
# BeautifulSoup picks whichever parser is installed and emits a warning.
soup = BeautifulSoup(data, 'html5lib')

update_info = soup.find('li', {'class': 'pointsupdateinfo'})
if update_info is None:
    raise ValueError("No 'pointsupdateinfo' element found at %s" % url)

# The last whitespace-separated token is the date in day/month/year form.
# Parsing with the slashes in place stays unambiguous for dates that are
# not zero-padded.
raw_date = update_info.text.split()[-1].strip()
d = datetime.strptime(raw_date, '%d/%m/%Y').date()
date = d.strftime('%Y%m%d')
print(date)


20141220

In [14]:
# Write the table to a CSV file named after the data's time stamp;
# the index (original-case player names) is not written.
csv_path = '../data/dreamteamfc_%s.csv' % date
df.to_csv(csv_path, index=False)