In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import time
import datetime
%matplotlib inline
Load data from processed dataset
In [4]:
matches_raw = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_Matches_Reduced.csv')
matches=matches_raw.copy()
print len(matches)
print matches.shape
In [5]:
matches.head()
Out[5]:
First job is to create a target variable.
This will be a 3-category classification problem: home win (1), draw (0), and away win (-1).
In [6]:
matches['Result_Target'] = np.nan
In [7]:
# Encode the match result: 1 = home win, -1 = away win, 0 = draw.
# .loc replaces the long-deprecated .ix indexer (removed in pandas >= 1.0).
matches.loc[matches['home_team_goal'] > matches['away_team_goal'], 'Result_Target'] = 1
matches.loc[matches['home_team_goal'] < matches['away_team_goal'], 'Result_Target'] = -1
matches.loc[matches['home_team_goal'] == matches['away_team_goal'], 'Result_Target'] = 0
In [8]:
matches['Result_Target'].value_counts().plot(kind = 'bar')
plt.xlabel('Result')
plt.ylabel('Frequency')
plt.title('Result_Target Distribution Frequency')
Out[8]:
In [9]:
matches.head() # Sanity check that the results column makes sense - Looks Ok!
Out[9]:
In [ ]:
## Season Standings
In [ ]:
def SeasonStanding(season):
    """Compute cumulative league points for every team after each stage.

    Parameters
    ----------
    season : DataFrame with columns 'stage', 'home_team_api_id',
        'away_team_api_id', 'home_team_goal', 'away_team_goal'.

    Returns
    -------
    dict mapping stage number (1..38) -> {team_api_id: cumulative points},
    using the standard 3 points for a win and 1 point each for a draw.

    Fixes vs. the original draft: `reset_index(inplace=True)` returned None
    (clobbering `sub`), team ids were read without a row index, and
    `points_dict[...] += 3` raised KeyError on a team's first match; the
    function also never returned anything.
    """
    points_dict = {}
    stage_dict = {}
    for stage in range(1, 39):
        sub = season[season['stage'] == stage]
        sub = sub.reset_index(drop=True)  # reset_index returns a new frame
        for i in range(len(sub)):
            home = sub['home_team_api_id'][i]
            away = sub['away_team_api_id'][i]
            home_goals = sub['home_team_goal'][i]
            away_goals = sub['away_team_goal'][i]
            if home_goals > away_goals:
                points_dict[home] = points_dict.get(home, 0) + 3
                points_dict[away] = points_dict.get(away, 0)
            elif home_goals < away_goals:
                points_dict[away] = points_dict.get(away, 0) + 3
                points_dict[home] = points_dict.get(home, 0)
            else:
                points_dict[home] = points_dict.get(home, 0) + 1
                points_dict[away] = points_dict.get(away, 0) + 1
        # Snapshot the table as it stood after this stage.
        stage_dict[stage] = dict(points_dict)
    return stage_dict
In [10]:
## Betting Odds Columns
odds = matches[matches.columns[9:27]].copy()
odds.head()
Out[10]:
In [11]:
# Classify each bookmaker column as home / draw / away odds.
# William Hill columns need explicit handling: 'WHD' and 'WHA' both contain
# the letter 'H', so the generic substring test below would mislabel them.
home_odds = []
away_odds = []
draw_odds = []
wh_buckets = {'WHH': home_odds, 'WHD': draw_odds, 'WHA': away_odds}
for colname in odds.columns:
    if colname in wh_buckets:
        wh_buckets[colname].append(colname)
    elif 'H' in colname:
        home_odds.append(colname)
    elif 'A' in colname:
        away_odds.append(colname)
    else:
        draw_odds.append(colname)
In [12]:
# Fill null values with the average of each row
matches['BWH'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['IWH'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['LBH'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['BWD'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['IWD'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['LBD'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['BWA'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['IWA'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['LBA'].fillna(odds[home_odds].mean(axis=1),inplace = True)
In [13]:
odds[home_odds].head()
Out[13]:
In [14]:
matches['Average_Home_Odds'] = odds[home_odds].mean(axis=1).round(2)
matches['Average_Away_Odds'] = odds[away_odds].mean(axis=1).round(2)
matches['Average_Draw_Odds'] = odds[draw_odds].mean(axis=1).round(2)
In [15]:
matches.head()
Out[15]:
In [16]:
matches[home_odds].head()
Out[16]:
In [17]:
matches[away_odds].head()
Out[17]:
In [18]:
matches[draw_odds].head()
Out[18]:
All columns look ok! Let's finally check and make sure there are no NAN values
In [19]:
# Collect every column that still contains missing values.
null_dict = {}
for col in matches.columns:
    n_missing = matches[col].isnull().sum()
    if n_missing:
        null_dict[col] = n_missing
null_dict
Out[19]:
This step will create 4 new features for each value of k: home/away goals scored and conceded over the last k matches.
For games where the last-k value cannot be calculated (the first k stages of a season), the attribute will be imputed with -1.
In [20]:
def LastKGoals(season_df, k):
    """For every match in one season, compute goals scored/conceded by each
    side over the previous k stages.

    Parameters
    ----------
    season_df : DataFrame for a single season with columns 'stage',
        'home_team_api_id', 'away_team_api_id', 'home_team_goal',
        'away_team_goal', 'match_api_id'.
    k : int, number of previous stages to aggregate over.

    Returns
    -------
    DataFrame {match_api_id, home_team_last_k_scored, home_team_last_k_conceded,
    away_team_last_k_scored, away_team_last_k_conceded}.  Matches in stages
    where the metric cannot be computed (stage <= k) are imputed with -1.

    Fixes vs. the original: the imputation hardcoded 10 matches per stage
    (now uses the actual stage size), and the goal sums used a redundant
    second boolean mask built from a differently-indexed frame.
    """
    def _goals_last_k(window, team):
        # Goals scored/conceded by `team` across its home and away games in `window`.
        as_home = window[window['home_team_api_id'] == team]
        as_away = window[window['away_team_api_id'] == team]
        scored = as_home['home_team_goal'].sum() + as_away['away_team_goal'].sum()
        conceded = as_home['away_team_goal'].sum() + as_away['home_team_goal'].sum()
        return scored, conceded

    home_scored, home_conceded = [], []
    away_scored, away_conceded = [], []
    match_api_id = []
    for stage in range(1, 39):
        cur_stage = season_df[season_df['stage'] == stage]
        stage_ids = cur_stage['match_api_id'].tolist()
        match_api_id.extend(stage_ids)
        if stage <= k:
            # Not enough history: impute -1 for every match in this stage.
            home_scored.extend([-1] * len(stage_ids))
            home_conceded.extend([-1] * len(stage_ids))
            away_scored.extend([-1] * len(stage_ids))
            away_conceded.extend([-1] * len(stage_ids))
            continue
        # Games played in the k stages immediately before this one.
        window = season_df[(season_df['stage'] >= stage - k) & (season_df['stage'] < stage)]
        for team in cur_stage['home_team_api_id'].tolist():
            scored, conceded = _goals_last_k(window, team)
            home_scored.append(scored)
            home_conceded.append(conceded)
        for team in cur_stage['away_team_api_id'].tolist():
            scored, conceded = _goals_last_k(window, team)
            away_scored.append(scored)
            away_conceded.append(conceded)
    return pd.DataFrame({'home_team_last_k_scored': home_scored,
                         'home_team_last_k_conceded': home_conceded,
                         'away_team_last_k_scored': away_scored,
                         'away_team_last_k_conceded': away_conceded,
                         'match_api_id': match_api_id})
In [21]:
def CreateLastKFeature(df, lastk):
    """Build last-k goal features for every season in `df` and return them
    as one DataFrame keyed by 'match_api_id', with columns renamed to embed
    the k value (e.g. 'home_last_3_scored').

    Fixes vs. the original: it subset the GLOBAL `matches` frame instead of
    the `df` argument, and accumulated five parallel lists by hand; one
    concat over per-season frames is equivalent and simpler.

    NOTE(review): the 'away_last_%d_concede' name (missing final 'd') is
    intentional — later analysis cells reference that exact column name.
    """
    per_season = [LastKGoals(df[df['season'] == year], lastk)
                  for year in df['season'].unique()]
    combined = pd.concat(per_season, ignore_index=True)
    return combined.rename(columns={
        'home_team_last_k_scored': 'home_last_%d_scored' % lastk,
        'home_team_last_k_conceded': 'home_last_%d_conceded' % lastk,
        'away_team_last_k_scored': 'away_last_%d_scored' % lastk,
        'away_team_last_k_conceded': 'away_last_%d_concede' % lastk,
    })
In [22]:
## Create features for k = 2,3,4,5
start = time.time()
for k_value in range(2,6):
matches = matches.merge(CreateLastKFeature(matches,k_value), how='inner', on = 'match_api_id')
end = time.time()
diff = end-start
print 'Adding new k features took ' + str(round(diff,2)) + ' seconds'
print matches.shape
In [23]:
matches.tail()
Out[23]:
Let's check the results and make sure there aren't any missing values.
In [24]:
# Re-scan for columns that still contain missing values after the merges.
null_dict = {}
for col in matches.columns:
    n_missing = matches[col].isnull().sum()
    if n_missing:
        null_dict[col] = n_missing
null_dict
Out[24]:
Great! There aren't any nulls
In [25]:
#Let's do some investigation on the new features
cols = matches.columns[31:]
for col in cols:
matches[col].plot(kind='hist')
plt.title(col)
plt.show()
This concludes the 'last-k-goals' features!
The goal of this feature is to append a categorical feature 'Day of the Week' to each match. Because some matches happen during the middle of the week, maybe that could provide some information to the model.
In [26]:
# Keep only the date component of each match timestamp (drop the time part).
dates_raw = matches['date'].tolist()
dates_split = [stamp.split(' ')[0] for stamp in dates_raw]
dates_split[0]
Out[26]:
In [27]:
# Weekday index for each match (Monday=0 .. Sunday=6).
# List comprehension instead of bare `map`: on Python 3, `map` returns an
# iterator, which would not expand into a DataFrame column.
day_of_week = [time.strptime(d, '%m/%d/%y').tm_wday for d in dates_split]
matches['day_of_week'] = day_of_week
In [28]:
matches['day_of_week'].plot(kind='hist') #Most matches played on saturday (day of week =5), but matches are also played on other days
Out[28]:
In [29]:
matches['day_of_week'].value_counts() #Most matches played on saturday (day of week =5), but matches are also played on other days
Out[29]:
In [30]:
# Let's re-investigate our matches dataframe
matches.tail()
Out[30]:
In [31]:
# One binary column per weekday.
# Fix: the original assumed all 7 weekdays appear in the data; if one were
# absent, get_dummies would yield fewer columns and the rename of 7 names
# would fail.  Reindexing guarantees a 0-filled column for every weekday.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_of_week_dummies = pd.get_dummies(matches['day_of_week']).reindex(columns=list(range(7)), fill_value=0)
day_of_week_dummies.columns = day_names
# Add the new binary columns to the matches dataframe
for day in day_names:
    matches[day] = day_of_week_dummies[day]
del matches['day_of_week']
In [32]:
matches.head()
Out[32]:
In [33]:
teams = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_teams.csv')
print teams.shape
teams.head()
Out[33]:
In [34]:
#Let's reduce the dataframe only to the columns we need
keepcols = ['team_api_id','team_long_name']
for col in teams.columns:
if col not in keepcols:
del teams[col]
teams.head()
Out[34]:
In [35]:
matches = matches.merge(teams,how = 'left', left_on = 'home_team_api_id', right_on = 'team_api_id' )
matches = matches.merge(teams,how = 'left', left_on = 'away_team_api_id', right_on = 'team_api_id')
matches.head()
Out[35]:
In [36]:
remove_cols = ['team_api_id_x','team_api_id_y']
for col in matches.columns:
if col in remove_cols:
del matches[col]
matches.rename(columns={'team_long_name_x': 'Home_Team', 'team_long_name_y': 'Away_Team'}, inplace=True)
In [37]:
matches.season.value_counts()
Out[37]:
In [38]:
print matches.head()
print matches.shape # We now have 56 total columns (some of them we will get rid of..)
In [39]:
team_attributes = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_team_attributes.csv')
team_attributes.head()
Out[39]:
In [40]:
# Let's see what columns make up this dataframe
team_attributes.columns
Out[40]:
In [41]:
# Remove columns that we won't use
remove_cols = ['buildUpPlaySpeedClass','buildUpPlayDribbling','buildUpPlayPassingClass','chanceCreationPassingClass',\
'chanceCreationCrossingClass','chanceCreationShootingClass','defencePressureClass','defenceAggressionClass',\
'defenceTeamWidthClass','ID']
for col in team_attributes.columns:
if col in remove_cols:
del team_attributes[col]
team_attributes.shape ## Now we have just 16 columns
Out[41]:
In [42]:
# Split dates to get just the date component
dates_raw = team_attributes['date'].tolist()
dates_split = []
for date in dates_raw:
dates_split.append(date.split(' ')[0])
In [43]:
# Extract the year and month from each record and append to team_attributes dataframe
# Extract the year and month from each record and append to team_attributes.
# Fixes: bare `map` returns an iterator on Python 3 (would not expand into
# a column), and each date was parsed twice — parse once and reuse.
parsed_dates = [time.strptime(d, '%m/%d/%y') for d in dates_split]
team_attributes['month'] = [t.tm_mon for t in parsed_dates]
team_attributes['year'] = [t.tm_year for t in parsed_dates]
In [44]:
def DetermineSeason(df):
    """Map a record's 'year'/'month' to an EPL season string 'YYYY/YYYY+1'.

    Seasons roll over in August: months January-July belong to the season
    that started the previous calendar year.  Matches the original ladder
    exactly, including two quirks: year 2008 always maps to '2008/2009'
    regardless of month, and years outside 2008-2016 return None.

    Intended for use with DataFrame.apply(axis=1).
    """
    year = df['year']
    month = df['month']
    if year < 2008 or year > 2016:
        return None  # outside the covered range (original fell through to None)
    start = year if (year == 2008 or month >= 8) else year - 1
    return '%d/%d' % (start, start + 1)
In [45]:
team_attributes['season'] = team_attributes.apply(DetermineSeason, axis = 1)
In [46]:
team_attributes.head(20)
Out[46]:
In [47]:
team_attributes['season'].unique() # By using team attributes, it eliminates data from 2008/2009 and 2012/2013 season! shoot..
Out[47]:
In [48]:
team_attributes['season'].value_counts()
Out[48]:
In [49]:
## Let's test merging the team features with the matches dataframe
matches = matches.merge(team_attributes, how = 'inner', left_on = ['home_team_api_id','season'], right_on = \
['team_api_id','season'])
matches.head()
Out[49]:
In [50]:
matches = matches.merge(team_attributes, how = 'inner', left_on = ['away_team_api_id','season'], right_on = \
['team_api_id','season'])
In [51]:
matches.shape # We have reduced the rows from 3040 to 2280...lost 25% of the data!.. We lose 2008/2009 and 2012/2013
Out[51]:
In [52]:
matches.season.value_counts()
Out[52]:
In [ ]:
def Stage2Cat(df):
    """Label a match as played before or after the January transfer window,
    using stage 22 as the cutoff.  Intended for DataFrame.apply(axis=1)."""
    return 'Before Transfer Window' if df['stage'] < 22 else 'After Transfer Window'
In [ ]:
matches['Stage_Cat'] = matches.apply(Stage2Cat, axis = 1)
In [ ]:
stage_cat_dummies = pd.get_dummies(matches['Stage_Cat'])
del matches['Stage_Cat']
matches = pd.concat([matches,stage_cat_dummies], axis = 1)
The goal of this feature is to have a flag that indicates if a team was newly promoted that season
In [57]:
# Teams promoted into the EPL at the start of each season (three per year).
# NOTE(review): there is no entry for '2008/2009' or '2016/2017' — confirm
# those seasons never reach the lookups below before indexing by season.
promoted_teams = {'2009/2010': ['Wolverhampton Wanderers','Birmingham City','Burnley'],\
'2010/2011': ['Newcastle United','West Bromwich Albion','Blackpool'],\
'2011/2012': ['Queens Park Rangers','Norwich City','Swansea City'],\
'2012/2013': ['Reading','Southampton','West Ham United'],\
'2013/2014': ['Cardiff City','Hull City','Crystal Palace'],\
'2014/2015': ['Leicester City','Burnley','Queens Park Rangers'],\
'2015/2016': ['AFC Bournemouth','Watford','Norwich City']}
In [58]:
def HomePromotedTeam(df):
    """Return 1 if the row's home team was newly promoted for the season the
    match was played in, else 0.  Intended for DataFrame.apply(axis=1).

    Fix: the original looped over every season and returned inside the loop
    on BOTH branches, so only the first season in matches['season'].unique()
    was ever checked — never the match's own season.  `.get` also guards
    seasons absent from `promoted_teams` (e.g. '2008/2009').
    """
    return 1 if df['Home_Team'] in promoted_teams.get(df['season'], []) else 0
def AwayPromotedTeam(df):
    """Return 1 if the row's away team was newly promoted for the season the
    match was played in, else 0.  Intended for DataFrame.apply(axis=1).

    Fix: the original returned on the first loop iteration (both branches
    return), checking only the first season in the list instead of the
    match's own season.  `.get` guards seasons absent from `promoted_teams`.
    """
    return 1 if df['Away_Team'] in promoted_teams.get(df['season'], []) else 0
In [59]:
matches['Home_Promoted_Flag'] = matches.apply(HomePromotedTeam, axis=1)
matches['Away_Promoted_Flat'] = matches.apply(AwayPromotedTeam, axis = 1)
In [61]:
matches.shape
Out[61]:
In [58]:
for col in matches.columns:
if matches[col].dtype == 'object':
print col
In [59]:
def CreateDummies(df, col):
    """One-hot encode `col`, drop it from `df` (mutates the caller's frame),
    and return a new frame with the dummy columns appended.

    Home_Team/Away_Team get '_home_team'/'_away_team' suffixes so the two
    sides stay distinguishable; any other column gets a '_<col>' suffix.
    """
    suffix_map = {'Home_Team': '_home_team', 'Away_Team': '_away_team'}
    suffix = suffix_map.get(col, '_' + col)
    dummies = pd.get_dummies(df[col])
    del df[col]
    dummies.columns = dummies.columns + suffix
    return pd.concat([df, dummies], axis=1)
In [60]:
dummy_cols = ['season','Home_Team','Away_Team','buildUpPlayDribblingClass_x',\
'buildUpPlayPositioningClass_x', 'chanceCreationPositioningClass_x',\
'defenceDefenderLineClass_x','buildUpPlayDribblingClass_y','buildUpPlayPositioningClass_y',\
'chanceCreationPositioningClass_y','defenceDefenderLineClass_y']
In [61]:
for dcol in dummy_cols:
matches = CreateDummies(matches,dcol)
matches.shape
Out[61]:
In [62]:
for col in matches.columns:
if matches[col].dtype == 'object':
print col
In [63]:
# Let's remove the date columns that we won't be using anymore
for col in matches.columns:
if 'date' in col:
del matches[col]
matches.shape
Out[63]:
Last step - Remove all features that we don't want to use
In [64]:
matches.columns[60:70]
Out[64]:
In [65]:
# Drop ID and merge-artifact columns before modeling.
# errors='ignore' skips any column already absent — replaces the original
# bare `except Exception: continue`, which silently swallowed all errors.
remove_cols = ['id_x', 'match_api_id', 'home_team_api_id', 'away_team_api_id',
               'month_x', 'year_x', 'id', 'team_fifa_api_id_y', 'team_api_id_y',
               'id_y', 'team_fifa_api_id_x', 'team_api_id_x', 'month_y', 'year_y']
matches = matches.drop(remove_cols, axis=1, errors='ignore')
matches.shape
Out[65]:
In [66]:
# Finally export matches dataframe for modeling
matches.to_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/20170127_Matches_w_Features.csv',index=None)
In [67]:
from scipy.stats import pearsonr
pearsonr(matches['home_last_4_scored'],matches['Result_Target'])
Out[67]:
In [91]:
# Plot correlation of each last-k metric with the result, for k = 2..5.
# Pairing metrics with their titles replaces the original counter + if-chain.
metric_list = ['home_last_%d_scored', 'home_last_%d_conceded',
               'away_last_%d_scored', 'away_last_%d_concede']
plot_titles = ['Home Last K Scored', 'Home Last K Conceded',
               'Away Last K Scored', 'Away Last K Conceded']
k_range = range(2, 6)
for metric, title in zip(metric_list, plot_titles):
    correlations = [pearsonr(matches[metric % k], matches['Result_Target'])[0]
                    for k in k_range]
    plt.plot(k_range, correlations)
    plt.xlabel('Number of Last K Matches')
    plt.ylabel('Correlation with Result')
    plt.title(title)
    plt.show()
In [94]:
metric_list = ['Home_Promoted_Flag', 'Away_Promoted_Flat']
count = 0
corr_list = []
for metric in metric_list:
corr_list.append(pearsonr(matches[metric],matches['Result_Target'])[0])
In [102]:
print 'Home Promoted Flag Correlation with Target= ' + str(round(corr_list[0],2))
print 'Away Promoted Flag Correlation with Target= ' + str(round(corr_list[1],2))
In [ ]: