This notebook performs exploratory analysis on the European Soccer dataset before new feature creation.
Additional exploration of the new features is located in the feature-creation notebook.
In [200]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
The first step is to read in the CSV files created by the extraction notebook.
In [201]:
matches = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_matches.csv')
print(len(matches))
print(matches.head())
In [202]:
matches.columns[:11] # Columns 1 - 10 identify the match and the number of goals scored by each team
Out[202]:
In [203]:
matches.columns[85:] # Columns 85 - 115 are betting odds from different websites
Out[203]:
In [204]:
matches.columns[11:55] # Columns 11-55 are (X,Y) coordinates for players on the pitch - Describing formation
Out[204]:
In [205]:
matches.columns[55:85] # Columns 55 - 77 give the player names. Columns 77-84 give some statistics based on the match.
Out[205]:
Let's remove any variables from the matches DataFrame that we won't need for this analysis.
In [206]:
matches_reduced = matches.copy()
removecols = matches.columns[11:85]
removecols_other = ['country_id','league_id']
In [207]:
for col in matches_reduced.columns:
    if col in removecols or col in removecols_other:
        del matches_reduced[col]
In [208]:
print(matches_reduced.shape)  # Column count after dropping the formation, player, and id columns
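The deletion loop above can also be written as a single `DataFrame.drop` call. A minimal sketch on synthetic column names, since the real CSV is not available here:

```python
import pandas as pd
import numpy as np

# Synthetic stand-in for the matches table: 6 columns instead of 115
df = pd.DataFrame(np.zeros((3, 6)),
                  columns=['match_api_id', 'country_id', 'league_id',
                           'home_player_X1', 'B365H', 'season'])

# Drop a positional slice plus named columns in one call
removecols = df.columns[3:4]          # analogous to matches.columns[11:85]
removecols_other = ['country_id', 'league_id']
reduced = df.drop(columns=list(removecols) + removecols_other)
print(reduced.columns.tolist())  # ['match_api_id', 'B365H', 'season']
```

`drop(columns=...)` returns a new frame, so the original stays intact without an explicit `copy()`.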
In [209]:
matches_reduced.season.value_counts()  # Equal number of matches per season
Out[209]:
In [210]:
# What does the 'stage' variable mean?
matches_reduced[matches_reduced.season=='2008/2009'].stage.value_counts()
Out[210]:
The 'stage' variable must mean the 'week' of the season: each stage consists of 10 matches, which gives a way to group matches by date.
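The stage-as-week reading can be checked by counting matches per (season, stage) pair. A sketch on synthetic rows, assuming the same 'season' and 'stage' column names as the dataset:

```python
import pandas as pd

# Synthetic matches: 2 stages with 3 matches each in one season
toy = pd.DataFrame({
    'season': ['2008/2009'] * 6,
    'stage':  [1, 1, 1, 2, 2, 2],
})

# If 'stage' really is a match week, every group should be the same size
per_stage = toy.groupby(['season', 'stage']).size()
print(per_stage.nunique() == 1)  # True: every stage has equal matches
```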
In [211]:
matches_reduced.head()
Out[211]:
Now let's check for missing values.
In [212]:
null_dict = {}
for col in matches_reduced.columns[4:]:
    nulls = matches_reduced[col].isnull().sum()
    if nulls > 0:
        null_dict[col] = nulls
null_dict
Out[212]:
Many of the betting-odds columns have null values. Let's remove the columns with excessive nulls.
In [213]:
for key in null_dict.keys():
    if null_dict[key] > 10:
        del matches_reduced[key]
matches_reduced.shape
Out[213]:
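The two loops above (count nulls per column, then delete the columns over the threshold) collapse into one boolean mask. A sketch assuming the same cutoff of 10 nulls:

```python
import pandas as pd

# Synthetic frame: one clean column, one with few nulls, one with many
df = pd.DataFrame({
    'clean':      list(range(20)),
    'few_nulls':  [None] * 5 + list(range(15)),
    'many_nulls': [None] * 15 + list(range(5)),
})

# Keep only the columns whose null count is within the threshold
kept = df.loc[:, df.isnull().sum() <= 10]
print(kept.columns.tolist())  # ['clean', 'few_nulls']
```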
In [214]:
matches_reduced.to_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_Matches_Reduced.csv',index= False)
In [215]:
team_attributes = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_team_attributes.csv')
print(len(team_attributes))
print(team_attributes.head())
In [217]:
team_attributes['date']
Out[217]:
In [75]:
team_attributes.columns
Out[75]:
In [97]:
null_dict = {}
for col in team_attributes.columns[4:]:
    nulls = team_attributes[col].isnull().sum()
    if nulls > 0:
        null_dict[col] = nulls
    if team_attributes[col].dtype == 'int64' or team_attributes[col].dtype == 'float64':
        team_attributes[col].plot(kind='hist')
        plt.xlabel(col)
        plt.title(col + ' Histogram')
        plt.show()
    elif team_attributes[col].dtype == 'object':
        team_attributes[col].value_counts().plot(kind='bar')  # Build-up play passing class value counts total 204, no nulls
        plt.title(col + ' Bar Chart')
        plt.show()
In [98]:
null_dict
Out[98]:
From the 'null_dict' object, only the numeric attribute 'buildUpPlayDribbling' has null values.
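Since 'buildUpPlayDribbling' is the only attribute with nulls, a median fill is one simple option before modeling. A sketch on synthetic values (the column name matches the dataset; the numbers are made up):

```python
import pandas as pd
import numpy as np

# Synthetic team attributes with a few missing dribbling scores
ta = pd.DataFrame({'buildUpPlayDribbling': [48.0, np.nan, 52.0, np.nan, 50.0]})

# Fill nulls with the column median so no rows are lost
median = ta['buildUpPlayDribbling'].median()
ta['buildUpPlayDribbling'] = ta['buildUpPlayDribbling'].fillna(median)
print(ta['buildUpPlayDribbling'].isnull().sum())  # 0
```

Median imputation keeps the column's distribution roughly intact; dropping the affected rows would discard otherwise complete records.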
In [9]:
teams = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_teams.csv')
print(len(teams))
print(teams.head())
In [218]:
teams.head()
Out[218]:
In [12]:
player_attributes = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/Player_Attributes.csv')
print(len(player_attributes))
print(player_attributes.head())
In [13]:
players = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/Players.csv')
print(len(players))
print(players.head())
In [ ]: