Purpose of this script is to create features to be used in model creation


In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import time
import datetime
%matplotlib inline

Load data from processed dataset


In [4]:
matches_raw = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_Matches_Reduced.csv')

matches=matches_raw.copy()
print len(matches)
print matches.shape


3040
(3040, 27)

In [5]:
matches.head()


Out[5]:
id season stage date match_api_id home_team_api_id away_team_api_id home_team_goal away_team_goal B365H ... IWA LBH LBD LBA WHH WHD WHA VCH VCD VCA
0 1729 2008/2009 1 8/17/08 0:00 489042 10260 10261 1 1 1.29 ... 8.50 1.25 4.5 10.00 1.25 4.5 10.00 1.28 5.5 12.00
1 1730 2008/2009 1 8/16/08 0:00 489043 9825 8659 1 0 1.20 ... 11.00 1.20 5.0 11.00 1.17 5.5 12.00 1.25 6.0 13.00
2 1731 2008/2009 1 8/16/08 0:00 489044 8472 8650 0 1 5.50 ... 1.65 4.50 3.3 1.67 5.50 3.3 1.57 5.50 3.8 1.65
3 1732 2008/2009 1 8/16/08 0:00 489045 8654 8528 2 1 1.91 ... 3.80 1.80 3.2 4.00 1.83 3.2 3.75 1.90 3.5 4.35
4 1733 2008/2009 1 8/17/08 0:00 489046 10252 8456 4 2 1.91 ... 3.30 1.83 3.2 3.75 1.91 3.2 3.50 1.90 3.5 4.35

5 rows × 27 columns

First job is to create a target variable.

This will be a 3 category classification problem:

  • Home Win
  • Away Win
  • Draw

In [6]:
matches['Result_Target'] = np.nan

In [7]:
matches.ix[matches['home_team_goal'] > matches['away_team_goal'], 'Result_Target'] = 1
matches.ix[matches['home_team_goal'] < matches['away_team_goal'], 'Result_Target'] = -1
matches.ix[matches['home_team_goal'] == matches['away_team_goal'], 'Result_Target'] = 0

In [8]:
matches['Result_Target'].value_counts().plot(kind = 'bar')
plt.xlabel('Result')
plt.ylabel('Frequency')
plt.title('Result_Target Distribution Frequency')


Out[8]:
<matplotlib.text.Text at 0x114cf4c90>

In [9]:
matches.head()  # Sanity check that the results column makes sense - Looks Ok!


Out[9]:
id season stage date match_api_id home_team_api_id away_team_api_id home_team_goal away_team_goal B365H ... LBH LBD LBA WHH WHD WHA VCH VCD VCA Result_Target
0 1729 2008/2009 1 8/17/08 0:00 489042 10260 10261 1 1 1.29 ... 1.25 4.5 10.00 1.25 4.5 10.00 1.28 5.5 12.00 0.0
1 1730 2008/2009 1 8/16/08 0:00 489043 9825 8659 1 0 1.20 ... 1.20 5.0 11.00 1.17 5.5 12.00 1.25 6.0 13.00 1.0
2 1731 2008/2009 1 8/16/08 0:00 489044 8472 8650 0 1 5.50 ... 4.50 3.3 1.67 5.50 3.3 1.57 5.50 3.8 1.65 -1.0
3 1732 2008/2009 1 8/16/08 0:00 489045 8654 8528 2 1 1.91 ... 1.80 3.2 4.00 1.83 3.2 3.75 1.90 3.5 4.35 1.0
4 1733 2008/2009 1 8/17/08 0:00 489046 10252 8456 4 2 1.91 ... 1.83 3.2 3.75 1.91 3.2 3.50 1.90 3.5 4.35 1.0

5 rows × 28 columns

Feature Creation Step


In [ ]:
## Season Standings

In [ ]:
def SeasonStanding(season):
    
    stages = range(1,39)
    
    points_dict = {}
    stage_dict = {}
    
    for stage in stages:
        
        sub = season[season['stage']==stage]
        sub = sub.reset_index(inplace = True)
        
        for i in range(len(sub)):
            
            if sub['home_team_goal'][i] > sub['away_team_goal'][i]:
                points_dict[sub['home_team_api_id']] +=3
Aggregated Odds - Average

In [10]:
## Betting Odds Columns
odds = matches[matches.columns[9:27]].copy()
odds.head()


Out[10]:
B365H B365D B365A BWH BWD BWA IWH IWD IWA LBH LBD LBA WHH WHD WHA VCH VCD VCA
0 1.29 5.5 11.00 1.30 4.75 8.25 1.3 4.4 8.50 1.25 4.5 10.00 1.25 4.5 10.00 1.28 5.5 12.00
1 1.20 6.5 15.00 1.22 5.50 10.00 1.2 5.2 11.00 1.20 5.0 11.00 1.17 5.5 12.00 1.25 6.0 13.00
2 5.50 3.6 1.67 5.00 3.35 1.67 4.5 3.5 1.65 4.50 3.3 1.67 5.50 3.3 1.57 5.50 3.8 1.65
3 1.91 3.4 4.20 1.90 3.20 3.80 1.8 3.3 3.80 1.80 3.2 4.00 1.83 3.2 3.75 1.90 3.5 4.35
4 1.91 3.4 4.33 1.95 3.20 3.60 2.0 3.2 3.30 1.83 3.2 3.75 1.91 3.2 3.50 1.90 3.5 4.35

In [11]:
home_odds = []
away_odds = []
draw_odds = []

for colname in odds.columns:
    if colname in ['WHH','WHD','WHA']:
        if colname == 'WHH':
            home_odds.append(colname)
        elif colname == 'WHD':
            draw_odds.append(colname)
        elif colname == 'WHA':
            away_odds.append(colname)
        continue
    if 'H' in colname:
        home_odds.append(colname)
    elif 'A' in colname:
        away_odds.append(colname)
    else:
        draw_odds.append(colname)

In [12]:
# Fill null values with the average of each row

matches['BWH'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['IWH'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['LBH'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['BWD'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['IWD'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['LBD'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['BWA'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['IWA'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['LBA'].fillna(odds[home_odds].mean(axis=1),inplace = True)

In [13]:
odds[home_odds].head()


Out[13]:
B365H BWH IWH LBH WHH VCH
0 1.29 1.30 1.3 1.25 1.25 1.28
1 1.20 1.22 1.2 1.20 1.17 1.25
2 5.50 5.00 4.5 4.50 5.50 5.50
3 1.91 1.90 1.8 1.80 1.83 1.90
4 1.91 1.95 2.0 1.83 1.91 1.90

In [14]:
matches['Average_Home_Odds'] = odds[home_odds].mean(axis=1).round(2)
matches['Average_Away_Odds'] = odds[away_odds].mean(axis=1).round(2)
matches['Average_Draw_Odds'] = odds[draw_odds].mean(axis=1).round(2)

In [15]:
matches.head()


Out[15]:
id season stage date match_api_id home_team_api_id away_team_api_id home_team_goal away_team_goal B365H ... WHH WHD WHA VCH VCD VCA Result_Target Average_Home_Odds Average_Away_Odds Average_Draw_Odds
0 1729 2008/2009 1 8/17/08 0:00 489042 10260 10261 1 1 1.29 ... 1.25 4.5 10.00 1.28 5.5 12.00 0.0 1.28 9.96 4.86
1 1730 2008/2009 1 8/16/08 0:00 489043 9825 8659 1 0 1.20 ... 1.17 5.5 12.00 1.25 6.0 13.00 1.0 1.21 12.00 5.62
2 1731 2008/2009 1 8/16/08 0:00 489044 8472 8650 0 1 5.50 ... 5.50 3.3 1.57 5.50 3.8 1.65 -1.0 5.08 1.65 3.48
3 1732 2008/2009 1 8/16/08 0:00 489045 8654 8528 2 1 1.91 ... 1.83 3.2 3.75 1.90 3.5 4.35 1.0 1.86 3.98 3.30
4 1733 2008/2009 1 8/17/08 0:00 489046 10252 8456 4 2 1.91 ... 1.91 3.2 3.50 1.90 3.5 4.35 1.0 1.92 3.80 3.28

5 rows × 31 columns


In [16]:
matches[home_odds].head()


Out[16]:
B365H BWH IWH LBH WHH VCH
0 1.29 1.30 1.3 1.25 1.25 1.28
1 1.20 1.22 1.2 1.20 1.17 1.25
2 5.50 5.00 4.5 4.50 5.50 5.50
3 1.91 1.90 1.8 1.80 1.83 1.90
4 1.91 1.95 2.0 1.83 1.91 1.90

In [17]:
matches[away_odds].head()


Out[17]:
B365A BWA IWA LBA WHA VCA
0 11.00 8.25 8.50 10.00 10.00 12.00
1 15.00 10.00 11.00 11.00 12.00 13.00
2 1.67 1.67 1.65 1.67 1.57 1.65
3 4.20 3.80 3.80 4.00 3.75 4.35
4 4.33 3.60 3.30 3.75 3.50 4.35

In [18]:
matches[draw_odds].head()


Out[18]:
B365D BWD IWD LBD WHD VCD
0 5.5 4.75 4.4 4.5 4.5 5.5
1 6.5 5.50 5.2 5.0 5.5 6.0
2 3.6 3.35 3.5 3.3 3.3 3.8
3 3.4 3.20 3.3 3.2 3.2 3.5
4 3.4 3.20 3.2 3.2 3.2 3.5

All columns look ok! Let's finally check and make sure there are no NAN values


In [19]:
null_dict = {}

for col in matches.columns:
    nulls = matches[col].isnull().sum()
    if nulls > 0:
        null_dict[col] = nulls

null_dict


Out[19]:
{}
Last K-Goals

This feature will create 4 new features

  • Home_Team_Last_K_Scored
  • Home_Team_Last_K_Conceded
  • Away_Team_Last_K_Scored
  • Away_Team_Last_K_Conceded

Games where last k value cannot be calculated, attribute will be imputed with -1


In [20]:
def LastKGoals(season_df,k):
    
    
    ## Function output:
    ##  DataFrame {match_api_id, home_team_last_k_scored , home_team_last_k_conceded, away_team_last_k_scored, away_team_last_k_conceded}
    
    home_team_last_k_scored = []
    home_team_last_k_conceded = []
    away_team_last_k_scored = []
    away_team_last_k_conceded = []
    match_api_id = []
    
    for i in range(1,39):
        # if last-k metric cannot be computed, impute with -1
        if i <= k:
            subset = season_df[season_df['stage'] == i]
            match_id = subset['match_api_id'].tolist()
            match_api_id.extend(match_id)
            
            # Impute values of -1 where last-k metric cannot be computed
            home_team_last_k_scored.extend([-1]*10)
            home_team_last_k_conceded.extend([-1]*10)
            away_team_last_k_scored.extend([-1]*10)
            away_team_last_k_conceded.extend([-1]*10)
            
            continue       

        #create subset of games in the k-stages before
        subset = season_df[season_df['stage'] >= i-k]
        subset = subset[subset['stage'] < i]
        
        cur_stage = season_df[season_df['stage'] == i]
        
        home_teams = cur_stage['home_team_api_id'].tolist()
        away_teams = cur_stage['away_team_api_id'].tolist()
        match_api_id.extend(cur_stage['match_api_id'].tolist())
        
        for team in home_teams:
            prev_home = subset[subset['home_team_api_id'] == team]
            prev_away = subset[subset['away_team_api_id'] == team]
            
            #last-k-scored
            prev_home_scored = prev_home['home_team_goal'][subset['home_team_api_id'] == team].tolist()
            prev_away_scored = prev_away['away_team_goal'][subset['away_team_api_id'] == team].tolist()
            #last-k-conceded
            prev_home_conceded = prev_home['away_team_goal'][subset['home_team_api_id'] == team].tolist()
            prev_away_conceded = prev_away['home_team_goal'][subset['away_team_api_id'] == team].tolist()            
            
            home_team_last_k_scored.append(sum(prev_home_scored)+sum(prev_away_scored))
            home_team_last_k_conceded.append(sum(prev_home_conceded)+sum(prev_away_conceded))
            
        for team in away_teams:
            prev_home = subset[subset['home_team_api_id'] == team]
            prev_away = subset[subset['away_team_api_id'] == team]
            
            #last-k-scored
            prev_home_scored = prev_home['home_team_goal'][subset['home_team_api_id'] == team].tolist()
            prev_away_scored = prev_away['away_team_goal'][subset['away_team_api_id'] == team].tolist()
            #last-k-conceded
            prev_home_conceded = prev_home['away_team_goal'][subset['home_team_api_id'] == team].tolist()
            prev_away_conceded = prev_away['home_team_goal'][subset['away_team_api_id'] == team].tolist()            
            
            away_team_last_k_scored.append(sum(prev_home_scored)+sum(prev_away_scored))
            away_team_last_k_conceded.append(sum(prev_home_conceded)+sum(prev_away_conceded))
    
    last_k_df = pd.DataFrame({'home_team_last_k_scored':home_team_last_k_scored,\
                                  'home_team_last_k_conceded': home_team_last_k_conceded,\
                                  'away_team_last_k_scored': away_team_last_k_scored,\
                                  'away_team_last_k_conceded':away_team_last_k_conceded,\
                                  'match_api_id':match_api_id})
    
    
    
    return last_k_df

In [21]:
def CreateLastKFeature(df,lastk):
    seasons = df['season'].unique()
    
    count = 0
    
    for year in seasons:
    
        matches_season = matches[matches['season'] == year]
    
        last_k_goals_df = LastKGoals(matches_season,lastk)
    
        if count == 0:
            home_k_scored = last_k_goals_df['home_team_last_k_scored'].tolist()
            home_k_conceded = last_k_goals_df['home_team_last_k_conceded'].tolist()
            away_k_scored = last_k_goals_df['away_team_last_k_scored'].tolist()
            away_k_conceded = last_k_goals_df['away_team_last_k_conceded'].tolist()
            match_id = last_k_goals_df['match_api_id'].tolist()
        else:
            home_k_scored.extend(last_k_goals_df['home_team_last_k_scored'].tolist())
            home_k_conceded.extend(last_k_goals_df['home_team_last_k_conceded'].tolist())
            away_k_scored.extend(last_k_goals_df['away_team_last_k_scored'].tolist())
            away_k_conceded.extend(last_k_goals_df['away_team_last_k_conceded'].tolist())
            match_id.extend(last_k_goals_df['match_api_id'].tolist())
    
        count += 1
        
        last_k_df = pd.DataFrame({'home_last_%d_scored'%(lastk):home_k_scored,\
                          'home_last_%d_conceded'%(lastk): home_k_conceded,\
                          'away_last_%d_scored'%(lastk): away_k_scored,\
                          'away_last_%d_concede'%(lastk): away_k_conceded,\
                          'match_api_id': match_id})
        
        
    return last_k_df

In [22]:
## Create features for k = 2,3,4,5

start = time.time()
for k_value in range(2,6):
    matches = matches.merge(CreateLastKFeature(matches,k_value), how='inner', on = 'match_api_id')

end = time.time()

diff = end-start

print 'Adding new k features took ' + str(round(diff,2)) + ' seconds'
print matches.shape


Adding new k features took 113.53 seconds
(3040, 47)

In [23]:
matches.tail()


Out[23]:
id season stage date match_api_id home_team_api_id away_team_api_id home_team_goal away_team_goal B365H ... home_last_3_conceded home_last_3_scored away_last_4_concede away_last_4_scored home_last_4_conceded home_last_4_scored away_last_5_concede away_last_5_scored home_last_5_conceded home_last_5_scored
3035 4764 2015/2016 9 10/17/15 0:00 1988795 8466 8197 2 2 1.75 ... 5 8 10 9 5 8 11 10 5 11
3036 4765 2015/2016 9 10/19/15 0:00 1988796 10003 10194 0 1 2.05 ... 5 3 5 5 6 3 6 5 7 5
3037 4766 2015/2016 9 10/17/15 0:00 1988797 8586 8650 0 0 2.50 ... 3 7 7 6 3 8 10 6 3 8
3038 4767 2015/2016 9 10/17/15 0:00 1988798 9817 9825 0 3 6.00 ... 3 3 4 10 3 4 4 11 5 4
3039 4768 2015/2016 9 10/17/15 0:00 1988799 8659 8472 1 0 2.05 ... 5 3 8 2 5 3 10 4 5 4

5 rows × 47 columns

Lets check the results and make sure there aren't any missing values


In [24]:
null_dict = {}

for col in matches.columns:
    nulls = matches[col].isnull().sum()
    if nulls > 0:
        null_dict[col] = nulls

null_dict


Out[24]:
{}

Great! There aren't any nulls


In [25]:
#Let's do some investigation on the new features

cols = matches.columns[31:]

for col in cols:
    matches[col].plot(kind='hist')
    plt.title(col)
    plt.show()


This concludes the 'last-k-goals' features!

Day of the week

The goal of this feature is to append a categorical feature 'Day of the Week' to each match. Because some matches happen during the middle of the week, maybe that could provide some information to the model.


In [26]:
dates_raw = matches['date'].tolist()
dates_split = []
for date in dates_raw:
    dates_split.append(date.split(' ')[0])

dates_split[0]


Out[26]:
'8/17/08'

In [27]:
day_of_week = map(lambda x: time.strptime(x,'%m/%d/%y').tm_wday ,dates_split)

matches['day_of_week'] = day_of_week

In [28]:
matches['day_of_week'].plot(kind='hist')   #Most matches played on saturday (day of week =5), but matches are also played on other days


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x11994cb10>

In [29]:
matches['day_of_week'].value_counts()   #Most matches played on saturday (day of week =5), but matches are also played on other days


Out[29]:
5    1718
6     736
2     230
0     161
1     150
4      23
3      22
Name: day_of_week, dtype: int64

In [30]:
# Let's re-investigate our matches dataframe

matches.tail()


Out[30]:
id season stage date match_api_id home_team_api_id away_team_api_id home_team_goal away_team_goal B365H ... home_last_3_scored away_last_4_concede away_last_4_scored home_last_4_conceded home_last_4_scored away_last_5_concede away_last_5_scored home_last_5_conceded home_last_5_scored day_of_week
3035 4764 2015/2016 9 10/17/15 0:00 1988795 8466 8197 2 2 1.75 ... 8 10 9 5 8 11 10 5 11 5
3036 4765 2015/2016 9 10/19/15 0:00 1988796 10003 10194 0 1 2.05 ... 3 5 5 6 3 6 5 7 5 0
3037 4766 2015/2016 9 10/17/15 0:00 1988797 8586 8650 0 0 2.50 ... 7 7 6 3 8 10 6 3 8 5
3038 4767 2015/2016 9 10/17/15 0:00 1988798 9817 9825 0 3 6.00 ... 3 4 10 3 4 4 11 5 4 5
3039 4768 2015/2016 9 10/17/15 0:00 1988799 8659 8472 1 0 2.05 ... 3 8 2 5 3 10 4 5 4 5

5 rows × 48 columns


In [31]:
day_of_week_dummies = pd.get_dummies(matches['day_of_week'])
day_of_week_dummies.columns = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

#Add new binary columns to matches dataframe

matches['Monday'] = day_of_week_dummies.Monday
matches['Tuesday'] = day_of_week_dummies.Tuesday
matches['Wednesday'] = day_of_week_dummies.Wednesday
matches['Thursday'] = day_of_week_dummies.Thursday
matches['Friday'] = day_of_week_dummies.Friday
matches['Saturday'] = day_of_week_dummies.Saturday
matches['Sunday'] = day_of_week_dummies.Sunday

del matches['day_of_week']

In [32]:
matches.head()


Out[32]:
id season stage date match_api_id home_team_api_id away_team_api_id home_team_goal away_team_goal B365H ... away_last_5_scored home_last_5_conceded home_last_5_scored Monday Tuesday Wednesday Thursday Friday Saturday Sunday
0 1729 2008/2009 1 8/17/08 0:00 489042 10260 10261 1 1 1.29 ... -1 -1 -1 0.0 0.0 0.0 0.0 0.0 0.0 1.0
1 1730 2008/2009 1 8/16/08 0:00 489043 9825 8659 1 0 1.20 ... -1 -1 -1 0.0 0.0 0.0 0.0 0.0 1.0 0.0
2 1731 2008/2009 1 8/16/08 0:00 489044 8472 8650 0 1 5.50 ... -1 -1 -1 0.0 0.0 0.0 0.0 0.0 1.0 0.0
3 1732 2008/2009 1 8/16/08 0:00 489045 8654 8528 2 1 1.91 ... -1 -1 -1 0.0 0.0 0.0 0.0 0.0 1.0 0.0
4 1733 2008/2009 1 8/17/08 0:00 489046 10252 8456 4 2 1.91 ... -1 -1 -1 0.0 0.0 0.0 0.0 0.0 0.0 1.0

5 rows × 54 columns

Now let's add some information about each team


In [33]:
teams = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_teams.csv')
print teams.shape
teams.head()


(34, 6)
Out[33]:
id team_api_id team_fifa_api_id team_long_name team_short_name ID
0 3457 10260 11 Manchester United MUN 10260
1 3458 10261 13 Newcastle United NEW 10261
2 3459 9825 1 Arsenal ARS 9825
3 3460 8659 109 West Bromwich Albion WBA 8659
4 3461 8472 106 Sunderland SUN 8472

In [34]:
#Let's reduce the dataframe only to the columns we need
keepcols = ['team_api_id','team_long_name']

for col in teams.columns:
    if col not in keepcols:
        del teams[col]

teams.head()


Out[34]:
team_api_id team_long_name
0 10260 Manchester United
1 10261 Newcastle United
2 9825 Arsenal
3 8659 West Bromwich Albion
4 8472 Sunderland

In [35]:
matches = matches.merge(teams,how = 'left', left_on = 'home_team_api_id', right_on = 'team_api_id' )
matches = matches.merge(teams,how = 'left', left_on = 'away_team_api_id', right_on = 'team_api_id')

matches.head()


Out[35]:
id season stage date match_api_id home_team_api_id away_team_api_id home_team_goal away_team_goal B365H ... Tuesday Wednesday Thursday Friday Saturday Sunday team_api_id_x team_long_name_x team_api_id_y team_long_name_y
0 1729 2008/2009 1 8/17/08 0:00 489042 10260 10261 1 1 1.29 ... 0.0 0.0 0.0 0.0 0.0 1.0 10260 Manchester United 10261 Newcastle United
1 1730 2008/2009 1 8/16/08 0:00 489043 9825 8659 1 0 1.20 ... 0.0 0.0 0.0 0.0 1.0 0.0 9825 Arsenal 8659 West Bromwich Albion
2 1731 2008/2009 1 8/16/08 0:00 489044 8472 8650 0 1 5.50 ... 0.0 0.0 0.0 0.0 1.0 0.0 8472 Sunderland 8650 Liverpool
3 1732 2008/2009 1 8/16/08 0:00 489045 8654 8528 2 1 1.91 ... 0.0 0.0 0.0 0.0 1.0 0.0 8654 West Ham United 8528 Wigan Athletic
4 1733 2008/2009 1 8/17/08 0:00 489046 10252 8456 4 2 1.91 ... 0.0 0.0 0.0 0.0 0.0 1.0 10252 Aston Villa 8456 Manchester City

5 rows × 58 columns


In [36]:
remove_cols = ['team_api_id_x','team_api_id_y']

for col in matches.columns:
    if col in remove_cols:
        del matches[col]
        
matches.rename(columns={'team_long_name_x': 'Home_Team', 'team_long_name_y': 'Away_Team'}, inplace=True)

In [37]:
matches.season.value_counts()


Out[37]:
2009/2010    380
2013/2014    380
2010/2011    380
2011/2012    380
2012/2013    380
2014/2015    380
2008/2009    380
2015/2016    380
Name: season, dtype: int64

In [38]:
print matches.head()
print matches.shape  # We now have 56 total columns (some of them we will get rid of..)


     id     season  stage          date  match_api_id  home_team_api_id  \
0  1729  2008/2009      1  8/17/08 0:00        489042             10260   
1  1730  2008/2009      1  8/16/08 0:00        489043              9825   
2  1731  2008/2009      1  8/16/08 0:00        489044              8472   
3  1732  2008/2009      1  8/16/08 0:00        489045              8654   
4  1733  2008/2009      1  8/17/08 0:00        489046             10252   

   away_team_api_id  home_team_goal  away_team_goal  B365H  \
0             10261               1               1   1.29   
1              8659               1               0   1.20   
2              8650               0               1   5.50   
3              8528               2               1   1.91   
4              8456               4               2   1.91   

           ...           home_last_5_scored  Monday  Tuesday  Wednesday  \
0          ...                           -1     0.0      0.0        0.0   
1          ...                           -1     0.0      0.0        0.0   
2          ...                           -1     0.0      0.0        0.0   
3          ...                           -1     0.0      0.0        0.0   
4          ...                           -1     0.0      0.0        0.0   

   Thursday  Friday  Saturday  Sunday          Home_Team             Away_Team  
0       0.0     0.0       0.0     1.0  Manchester United      Newcastle United  
1       0.0     0.0       1.0     0.0            Arsenal  West Bromwich Albion  
2       0.0     0.0       1.0     0.0         Sunderland             Liverpool  
3       0.0     0.0       1.0     0.0    West Ham United        Wigan Athletic  
4       0.0     0.0       0.0     1.0        Aston Villa       Manchester City  

[5 rows x 56 columns]
(3040, 56)

Load Team Attributes


In [39]:
team_attributes = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_team_attributes.csv')
team_attributes.head()


Out[39]:
id team_fifa_api_id team_api_id date buildUpPlaySpeed buildUpPlaySpeedClass buildUpPlayDribbling buildUpPlayDribblingClass buildUpPlayPassing buildUpPlayPassingClass ... chanceCreationShootingClass chanceCreationPositioningClass defencePressure defencePressureClass defenceAggression defenceAggressionClass defenceTeamWidth defenceTeamWidthClass defenceDefenderLineClass ID
0 71 1 9825 2/22/10 0:00 66 Balanced NaN Little 30 Short ... Normal Free Form 30 Deep 40 Press 50 Normal Cover 9825
1 72 1 9825 2/22/11 0:00 75 Fast NaN Little 40 Mixed ... Normal Free Form 50 Medium 40 Press 45 Normal Cover 9825
2 73 1 9825 2/22/12 0:00 25 Slow NaN Little 32 Short ... Little Free Form 57 Medium 57 Press 52 Normal Cover 9825
3 74 1 9825 9/20/13 0:00 30 Slow NaN Little 29 Short ... Normal Free Form 64 Medium 54 Press 52 Normal Cover 9825
4 75 1 9825 9/19/14 0:00 59 Balanced 51.0 Normal 26 Short ... Normal Free Form 51 Medium 44 Press 52 Normal Cover 9825

5 rows × 26 columns


In [40]:
# Let's see what columns make up this dataframe
team_attributes.columns


Out[40]:
Index([u'id', u'team_fifa_api_id', u'team_api_id', u'date',
       u'buildUpPlaySpeed', u'buildUpPlaySpeedClass', u'buildUpPlayDribbling',
       u'buildUpPlayDribblingClass', u'buildUpPlayPassing',
       u'buildUpPlayPassingClass', u'buildUpPlayPositioningClass',
       u'chanceCreationPassing', u'chanceCreationPassingClass',
       u'chanceCreationCrossing', u'chanceCreationCrossingClass',
       u'chanceCreationShooting', u'chanceCreationShootingClass',
       u'chanceCreationPositioningClass', u'defencePressure',
       u'defencePressureClass', u'defenceAggression',
       u'defenceAggressionClass', u'defenceTeamWidth',
       u'defenceTeamWidthClass', u'defenceDefenderLineClass', u'ID'],
      dtype='object')

In [41]:
# Remove columns that we won't use
remove_cols = ['buildUpPlaySpeedClass','buildUpPlayDribbling','buildUpPlayPassingClass','chanceCreationPassingClass',\
               'chanceCreationCrossingClass','chanceCreationShootingClass','defencePressureClass','defenceAggressionClass',\
               'defenceTeamWidthClass','ID']

for col in team_attributes.columns:
    if col in remove_cols:
        del team_attributes[col]
        
team_attributes.shape ## Now we have just 16 columns


Out[41]:
(204, 16)

In [42]:
# Split dates to get just the date component
dates_raw = team_attributes['date'].tolist()
dates_split = []
for date in dates_raw:
    dates_split.append(date.split(' ')[0])

In [43]:
# Extract the year and month from each record and append to team_attributes dataframe
team_attributes['month'] = map(lambda x: time.strptime(x,'%m/%d/%y').tm_mon ,dates_split)
team_attributes['year'] = map(lambda x: time.strptime(x,'%m/%d/%y').tm_year ,dates_split)

In [44]:
def DetermineSeason(df):
    if df['year'] == 2008:
        return '2008/2009'
    if df['year'] == 2009:
        if df['month'] < 8:
            return '2008/2009'
        else:
            return '2009/2010'
    if df['year'] == 2010:
        if df['month'] < 8:
            return '2009/2010'
        else:
            return '2010/2011'
    if df['year'] == 2011:
        if df['month'] < 8:
            return '2010/2011'
        else:
            return '2011/2012'
    if df['year'] == 2012:
        if df['month'] < 8:
            return '2011/2012'
        else:
            return '2012/2013'
    if df['year'] == 2013:
        if df['month'] < 8:
            return '2012/2013'
        else:
            return '2013/2014'
    if df['year'] == 2014:
        if df['month'] < 8:
            return '2013/2014'
        else:
            return '2014/2015'
    if df['year'] == 2015:
        if df['month'] < 8:
            return '2014/2015'
        else:
            return '2015/2016'
    if df['year'] == 2016:
        if df['month'] < 8:
            return '2015/2016'
        else:
            return '2016/2017'

In [45]:
team_attributes['season'] = team_attributes.apply(DetermineSeason, axis = 1)

In [46]:
team_attributes.head(20)


Out[46]:
id team_fifa_api_id team_api_id date buildUpPlaySpeed buildUpPlayDribblingClass buildUpPlayPassing buildUpPlayPositioningClass chanceCreationPassing chanceCreationCrossing chanceCreationShooting chanceCreationPositioningClass defencePressure defenceAggression defenceTeamWidth defenceDefenderLineClass month year season
0 71 1 9825 2/22/10 0:00 66 Little 30 Free Form 30 45 35 Free Form 30 40 50 Cover 2 2010 2009/2010
1 72 1 9825 2/22/11 0:00 75 Little 40 Free Form 40 45 65 Free Form 50 40 45 Cover 2 2011 2010/2011
2 73 1 9825 2/22/12 0:00 25 Little 32 Organised 41 34 30 Free Form 57 57 52 Cover 2 2012 2011/2012
3 74 1 9825 9/20/13 0:00 30 Little 29 Organised 41 55 39 Free Form 64 54 52 Cover 9 2013 2013/2014
4 75 1 9825 9/19/14 0:00 59 Normal 26 Organised 28 55 64 Free Form 51 44 52 Cover 9 2014 2014/2015
5 76 1 9825 9/10/15 0:00 59 Normal 30 Organised 28 44 46 Free Form 51 44 52 Cover 9 2015 2015/2016
6 77 2 10252 2/22/10 0:00 70 Little 59 Organised 65 70 50 Free Form 30 70 30 Cover 2 2010 2009/2010
7 78 2 10252 2/22/11 0:00 65 Little 50 Organised 45 55 50 Organised 35 45 65 Cover 2 2011 2010/2011
8 79 2 10252 2/22/12 0:00 67 Little 58 Organised 44 60 56 Organised 43 50 52 Cover 2 2012 2011/2012
9 80 2 10252 9/20/13 0:00 63 Little 59 Organised 52 55 34 Organised 39 41 54 Cover 9 2013 2013/2014
10 81 2 10252 9/19/14 0:00 66 Little 72 Organised 63 48 56 Organised 39 42 52 Cover 9 2014 2014/2015
11 82 2 10252 9/10/15 0:00 63 Normal 54 Organised 60 48 38 Organised 35 44 54 Cover 9 2015 2015/2016
12 167 88 8658 2/22/10 0:00 70 Little 70 Organised 70 70 70 Organised 70 70 70 Cover 2 2010 2009/2010
13 168 88 8658 2/22/11 0:00 45 Little 60 Organised 35 50 40 Organised 30 60 45 Cover 2 2011 2010/2011
14 169 88 8658 2/22/12 0:00 51 Little 55 Organised 41 62 56 Organised 42 48 54 Cover 2 2012 2011/2012
15 170 88 8658 9/20/13 0:00 56 Little 57 Organised 45 71 41 Organised 42 44 54 Cover 9 2013 2013/2014
16 171 88 8658 9/19/14 0:00 56 Normal 79 Organised 77 71 56 Organised 42 48 54 Cover 9 2014 2014/2015
17 172 88 8658 9/10/15 0:00 56 Little 75 Organised 77 62 64 Organised 42 47 54 Cover 9 2015 2015/2016
18 173 3 8655 2/22/10 0:00 55 Little 70 Organised 60 70 55 Organised 45 70 45 Cover 2 2010 2009/2010
19 174 3 8655 2/22/11 0:00 55 Little 65 Organised 50 50 35 Organised 45 60 55 Cover 2 2011 2010/2011

In [47]:
team_attributes['season'].unique()   # By using team attributes, it eliminates data from 2008/2009 and 2012/2013 season! shoot..


Out[47]:
array(['2009/2010', '2010/2011', '2011/2012', '2013/2014', '2014/2015',
       '2015/2016'], dtype=object)

In [48]:
team_attributes['season'].value_counts()


Out[48]:
2014/2015    34
2009/2010    34
2010/2011    34
2015/2016    34
2013/2014    34
2011/2012    34
Name: season, dtype: int64

In [49]:
## Let's test merging the team features with the matches dataframe

matches = matches.merge(team_attributes, how = 'inner', left_on = ['home_team_api_id','season'], right_on = \
                  ['team_api_id','season'])

matches.head()


Out[49]:
id_x season stage date_x match_api_id home_team_api_id away_team_api_id home_team_goal away_team_goal B365H ... chanceCreationPassing chanceCreationCrossing chanceCreationShooting chanceCreationPositioningClass defencePressure defenceAggression defenceTeamWidth defenceDefenderLineClass month year
0 2109 2009/2010 1 8/16/09 0:00 658571 10260 8658 1 0 1.20 ... 45 70 65 Free Form 40 50 40 Cover 2 2010
1 2130 2009/2010 11 10/31/09 0:00 658710 10260 8655 2 0 1.17 ... 45 70 65 Free Form 40 50 40 Cover 2 2010
2 2156 2009/2010 13 11/21/09 0:00 658796 10260 8668 3 0 1.40 ... 45 70 65 Free Form 40 50 40 Cover 2 2010
3 2179 2009/2010 16 12/12/09 0:00 658819 10260 10252 0 1 1.50 ... 45 70 65 Free Form 40 50 40 Cover 2 2010
4 2192 2009/2010 17 12/15/09 0:00 658832 10260 8602 3 0 1.17 ... 45 70 65 Free Form 40 50 40 Cover 2 2010

5 rows × 74 columns


In [50]:
matches = matches.merge(team_attributes, how = 'inner', left_on = ['away_team_api_id','season'], right_on = \
                  ['team_api_id','season'])

In [51]:
matches.shape  # We have reduced the rows from 3040 to 2280...lost 25% of the data!.. We lose 2008/2009 and 2012/2013


Out[51]:
(2280, 92)

In [52]:
matches.season.value_counts()


Out[52]:
2009/2010    380
2013/2014    380
2010/2011    380
2011/2012    380
2014/2015    380
2015/2016    380
Name: season, dtype: int64

Stage to categorical


In [ ]:
def Stage2Cat(df):
    if df['stage'] < 22:
        return 'Before Transfer Window'
    else:
        return 'After Transfer Window'

In [ ]:
matches['Stage_Cat'] = matches.apply(Stage2Cat, axis = 1)

In [ ]:
stage_cat_dummies = pd.get_dummies(matches['Stage_Cat'])
del matches['Stage_Cat']
matches = pd.concat([matches,stage_cat_dummies], axis = 1)

The goal of this feature is to have a flag that indicates if a team was newly promoted that season


In [57]:
promoted_teams = {'2009/2010': ['Wolverhampton Wanderers','Birmingham City','Burnley'],\
                  '2010/2011': ['Newcastle United','West Bromwich Albion','Blackpool'],\
                  '2011/2012': ['Queens Park Rangers','Norwich City','Swansea City'],\
                  '2012/2013': ['Reading','Southampton','West Ham United'],\
                  '2013/2014': ['Cardiff City','Hull City','Crystal Palace'],\
                  '2014/2015': ['Leicester City','Burnley','Queens Park Rangers'],\
                  '2015/2016': ['AFC Bournemouth','Watford','Norwich City']}

In [58]:
def HomePromotedTeam(df):
    seasonlist = matches['season'].unique().tolist()
    for season in seasonlist:
        if df['Home_Team'] in promoted_teams[season]:
            return 1
        else:
            return 0
def AwayPromotedTeam(df):
    seasonlist = matches['season'].unique().tolist()
    for season in seasonlist:
        if df['Away_Team'] in promoted_teams[season]:
            return 1
        else:
            return 0

In [59]:
matches['Home_Promoted_Flag'] = matches.apply(HomePromotedTeam, axis=1)
matches['Away_Promoted_Flat'] = matches.apply(AwayPromotedTeam, axis = 1)

In [61]:
matches.shape


Out[61]:
(2280, 95)

Now we need to convert features to numeric or dummies to use in the modeling process


In [58]:
for col in matches.columns:
    if matches[col].dtype == 'object':
        print col


season
date_x
Home_Team
Away_Team
date_y
buildUpPlayDribblingClass_x
buildUpPlayPositioningClass_x
chanceCreationPositioningClass_x
defenceDefenderLineClass_x
date
buildUpPlayDribblingClass_y
buildUpPlayPositioningClass_y
chanceCreationPositioningClass_y
defenceDefenderLineClass_y

In [59]:
def CreateDummies(df,col):
    dummies = pd.get_dummies(df[col])
    del df[col]
    if col == 'Home_Team':
        dummies.columns = dummies.columns +'_home_team'
    elif col == 'Away_Team':
        dummies.columns = dummies.columns +'_away_team'
    else:
        dummies.columns = dummies.columns + '_'+col
    df = pd.concat([df,dummies], axis=1)
    return df

In [60]:
dummy_cols = ['season','Home_Team','Away_Team','buildUpPlayDribblingClass_x',\
              'buildUpPlayPositioningClass_x', 'chanceCreationPositioningClass_x',\
              'defenceDefenderLineClass_x','buildUpPlayDribblingClass_y','buildUpPlayPositioningClass_y',\
              'chanceCreationPositioningClass_y','defenceDefenderLineClass_y']

In [61]:
for dcol in dummy_cols:
    matches = CreateDummies(matches,dcol)

matches.shape


Out[61]:
(2280, 168)

In [62]:
for col in matches.columns:
    if matches[col].dtype == 'object':
        print col


date_x
date_y
date

In [63]:
# Let's remove the date columns that we won't be using anymore
for col in matches.columns:
    if 'date' in col:
        del matches[col]

matches.shape


Out[63]:
(2280, 165)

Last step - Remove all features that we don't want to use


In [64]:
matches.columns[60:70]


Out[64]:
Index([u'defenceAggression_x', u'defenceTeamWidth_x', u'month_x', u'year_x',
       u'id', u'team_fifa_api_id_y', u'team_api_id_y', u'buildUpPlaySpeed_y',
       u'buildUpPlayPassing_y', u'chanceCreationPassing_y'],
      dtype='object')

In [65]:
remove_cols = ['id_x','match_api_id','home_team_api_id','away_team_api_id',\
               'month_x','year_x','id','team_fifa_api_id_y','team_api_id_y',\
               'id_y','team_fifa_api_id_x','team_api_id_x','month_y','year_y']

for col in remove_cols:
    try:
        del matches[col]
    except Exception:
        continue
    
matches.shape


Out[65]:
(2280, 151)

In [66]:
# Finally export matches dataframe for modeling

matches.to_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/20170127_Matches_w_Features.csv',index=None)

In [67]:
from scipy.stats import pearsonr
pearsonr(matches['home_last_4_scored'],matches['Result_Target'])


Out[67]:
(0.14756617452432319, 1.4299129510091769e-12)

Data Analysis


In [91]:
metric_list = ['home_last_%d_scored','home_last_%d_conceded','away_last_%d_scored','away_last_%d_concede']
count = 0
for metric in metric_list:
    corr_list = []
    count +=1
    for i in range(2,6):
        corr_list.append(pearsonr(matches[metric%i],matches['Result_Target'])[0])
    plt.plot(range(2,6),corr_list)
    plt.xlabel('Number of Last K Matches')
    plt.ylabel('Correlation with Result')
    if count == 1:
        plt.title('Home Last K Scored')
    if count == 2:
        plt.title('Home Last K Conceded')
    if count == 3:
        plt.title('Away Last K Scored')
    if count == 4:
        plt.title('Away Last K Conceded')
        
    plt.show()



In [94]:
metric_list = ['Home_Promoted_Flag', 'Away_Promoted_Flat']
count = 0
corr_list = []
for metric in metric_list:
    corr_list.append(pearsonr(matches[metric],matches['Result_Target'])[0])

In [102]:
print 'Home Promoted Flag Correlation with Target= ' + str(round(corr_list[0],2))
print 'Away Promoted Flag Correlation with Target= ' + str(round(corr_list[1],2))


Home Promoted Flag Correlation with Target= -0.07
Away Promoted Flag Correlation with Target= 0.09

In [ ]: