In [1]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -v
In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests
In [3]:
# Downloading and parsing the player-statistics table into a Python dict.
# One entry per player: [name, position, team, vfm, value, points].
player_dict = {}

url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')
# Note: html5lib deals better with broken html than lxml

name_list = []
for td in soup.find_all("td", {"class": "tabName"}):  # find_all: modern bs4 name for findAll
    # Cell text looks like "...Statistics<player name>"; keep the name part.
    name = td.text.split('Statistics')[-1].strip()
    if name:
        name_list.append(name)
        # The sibling <td> cells hold the remaining stats columns, in order.
        res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
        position, team, vfm, value, points = res
        value = value.strip('m')  # e.g. '5.2m' -> '5.2'
        # NOTE(review): keyed by display name, so two players sharing a name
        # would overwrite each other — verify names are unique on this site.
        player_dict[name] = [name, position, team, vfm, value, points]

print('Found: %s' % len(name_list))
print(name_list[-1])
In [4]:
# Reading the scraped records into a pandas DataFrame (one row per player,
# indexed by player name) and fixing the dtypes of the numeric columns,
# which arrive from the scrape as strings.
columns = ['name', 'position', 'team', 'vfm', 'value', 'points']
df = pd.DataFrame.from_dict(player_dict, orient='index')
df.columns = columns
df = df.astype({'vfm': float, 'value': float, 'points': int})
df.tail()
Out[4]:
In [5]:
df.describe()
Out[5]:
In [6]:
# Add empty placeholder columns for the injury/card information,
# which is filled in from a second page below.
for col in ('status', 'description', 'returns'):
    df[col] = pd.Series('', index=df.index)
In [7]:
# Scraping the injuries-and-cards table and merging status/description/returns
# into the existing DataFrame, matched on the player-name index.
url = 'https://www.dreamteamfc.com/statistics/injuries-and-cards/ALL/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []
for td in soup.find_all("td", {"class": "tabName2"}):  # find_all: modern bs4 name for findAll
    # Cell text looks like "...stats<player name>"; keep the name part.
    name = td.text.split('stats')[-1].strip()
    if name:
        name_list.append(name)
        # The sibling <td> cells hold the remaining columns, in order.
        res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
        position, team, status, description, returns = res
        # Match on the index (player name) set by the first scrape; players
        # not present on this page keep the empty-string defaults.
        df.loc[df.index == name, ['status', 'description', 'returns']] = status, description, returns

print('Found: %s' % len(name_list))
print(name_list[-1])
In [8]:
df.tail()
Out[8]:
In [9]:
# Add zero-valued placeholder columns for recent form,
# which is filled in from the form-guide page below.
for col in ('month_points', 'week_points'):
    df[col] = pd.Series(0, index=df.index)
In [10]:
# Scraping the form-guide page for each player's month/week points and
# merging them into the DataFrame, matched on the player-name index.
url = 'https://www.dreamteamfc.com/statistics/form-guide/all'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []
for td in soup.find_all("td", {"class": "tabName"}):  # find_all: modern bs4 name for findAll
    name = td.text.strip()
    if name:
        name_list.append(name)
        res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]
        # The last two columns are the month and week points. Some rows hold
        # non-numeric placeholders; those players keep the 0 defaults.
        try:
            month_pts, week_pts = float(res[-2]), float(res[-1])
        except ValueError:
            continue
        df.loc[df.index == name, ['month_points', 'week_points']] = month_pts, week_pts

print('Found: %s' % len(name_list))
print(name_list[-1])
In [12]:
# Reordering the columns into their final layout.
df = df[['name', 'position', 'team', 'vfm', 'value', 'points', 'month_points',
         'week_points', 'status', 'description', 'returns']]

# "Normalizing" player names to lowercase. The vectorized .str accessor
# replaces the Python-level apply/lambda and also tolerates missing values.
df['name'] = df['name'].str.lower()
df.tail()
Out[12]:
In [13]:
# Getting the current time stamp of the data from the points-update notice
# on the statistics page, reformatted as YYYYMMDD for the output filename.
from datetime import datetime

url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
r = requests.get(url)
# Parser made explicit (html5lib), consistent with the other cells; the
# original bare BeautifulSoup(data) call guesses a parser and warns.
soup = BeautifulSoup(r.text, 'html5lib')

raw_date = soup.find('li', {'class': 'pointsupdateinfo'}).text
# e.g. "... 21/08/2015" -> "21082015" -> date -> "20150821"
raw_date = raw_date.split()[-1].replace('/', '').strip()
d = datetime.strptime(raw_date, '%d%m%Y').date()
date = d.strftime('%Y%m%d')
print(date)
In [14]:
df.to_csv('../data/dreamteamfc_%s.csv' % date, index=False)