In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import time
import datetime
%matplotlib inline
Load data from processed dataset
In [4]:
matches_raw = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_Matches_Reduced.csv')
matches=matches_raw.copy()
print len(matches)
print matches.shape
In [5]:
matches.head()
Out[5]:
First job is to create a target variable.
This will be a 3-category classification problem: home win (1), draw (0), and away win (-1).
In [6]:
matches['Result_Target'] = np.nan
In [7]:
# Encode the match result: 1 = home win, -1 = away win, 0 = draw.
# .loc replaces the long-deprecated .ix indexer (removed in pandas >= 1.0).
matches.loc[matches['home_team_goal'] > matches['away_team_goal'], 'Result_Target'] = 1
matches.loc[matches['home_team_goal'] < matches['away_team_goal'], 'Result_Target'] = -1
matches.loc[matches['home_team_goal'] == matches['away_team_goal'], 'Result_Target'] = 0
In [8]:
matches['Result_Target'].value_counts().plot(kind = 'bar')
plt.xlabel('Result')
plt.ylabel('Frequency')
plt.title('Result_Target Distribution Frequency')
Out[8]:
In [9]:
matches.head() # Sanity check that the results column makes sense - Looks Ok!
Out[9]:
In [ ]:
## Season Standings
In [ ]:
def SeasonStanding(season):
    """Compute cumulative league points for every team after each stage.

    Parameters
    ----------
    season : DataFrame with columns 'stage', 'home_team_api_id',
        'away_team_api_id', 'home_team_goal', 'away_team_goal'.

    Returns
    -------
    dict mapping stage number (1..38) -> {team_api_id: cumulative points},
    using the standard 3 points for a win and 1 point each for a draw.

    Fixes vs. the original draft: `reset_index(inplace=True)` returned None
    (clobbering `sub`), team ids were read without a row index, and
    `points_dict[...] += 3` raised KeyError on a team's first match; the
    function also never returned anything.
    """
    points_dict = {}
    stage_dict = {}
    for stage in range(1, 39):
        sub = season[season['stage'] == stage]
        sub = sub.reset_index(drop=True)  # reset_index returns a new frame
        for i in range(len(sub)):
            home = sub['home_team_api_id'][i]
            away = sub['away_team_api_id'][i]
            home_goals = sub['home_team_goal'][i]
            away_goals = sub['away_team_goal'][i]
            if home_goals > away_goals:
                points_dict[home] = points_dict.get(home, 0) + 3
                points_dict[away] = points_dict.get(away, 0)
            elif home_goals < away_goals:
                points_dict[away] = points_dict.get(away, 0) + 3
                points_dict[home] = points_dict.get(home, 0)
            else:
                points_dict[home] = points_dict.get(home, 0) + 1
                points_dict[away] = points_dict.get(away, 0) + 1
        # Snapshot the table as it stood after this stage.
        stage_dict[stage] = dict(points_dict)
    return stage_dict
In [10]:
## Betting Odds Columns
odds = matches[matches.columns[9:27]].copy()
odds.head()
Out[10]:
In [11]:
# Classify each bookmaker column as home / draw / away odds.
# William Hill columns need explicit handling: 'WHD' and 'WHA' both contain
# the letter 'H', so the generic substring test below would mislabel them.
home_odds = []
away_odds = []
draw_odds = []
wh_buckets = {'WHH': home_odds, 'WHD': draw_odds, 'WHA': away_odds}
for colname in odds.columns:
    if colname in wh_buckets:
        wh_buckets[colname].append(colname)
    elif 'H' in colname:
        home_odds.append(colname)
    elif 'A' in colname:
        away_odds.append(colname)
    else:
        draw_odds.append(colname)
In [12]:
# Fill null values with the average of each row
matches['BWH'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['IWH'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['LBH'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['BWD'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['IWD'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['LBD'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['BWA'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['IWA'].fillna(odds[home_odds].mean(axis=1),inplace = True)
matches['LBA'].fillna(odds[home_odds].mean(axis=1),inplace = True)
In [13]:
odds[home_odds].head()
Out[13]:
In [14]:
matches['Average_Home_Odds'] = odds[home_odds].mean(axis=1).round(2)
matches['Average_Away_Odds'] = odds[away_odds].mean(axis=1).round(2)
matches['Average_Draw_Odds'] = odds[draw_odds].mean(axis=1).round(2)
In [15]:
matches.head()
Out[15]:
In [16]:
matches[home_odds].head()
Out[16]:
In [17]:
matches[away_odds].head()
Out[17]:
In [18]:
matches[draw_odds].head()
Out[18]:
All columns look ok! Let's finally check and make sure there are no NAN values
In [19]:
# Collect every column that still contains missing values.
null_dict = {}
for col in matches.columns:
    n_missing = matches[col].isnull().sum()
    if n_missing:
        null_dict[col] = n_missing
null_dict
Out[19]:
This step will create 4 new features for each value of k: home/away goals scored and conceded over the last k matches.
For games where the last-k value cannot be calculated (the first k stages of a season), the attribute will be imputed with -1.
In [20]:
def LastKGoals(season_df, k):
    """For every match in one season, compute goals scored/conceded by each
    side over the previous k stages.

    Parameters
    ----------
    season_df : DataFrame for a single season with columns 'stage',
        'home_team_api_id', 'away_team_api_id', 'home_team_goal',
        'away_team_goal', 'match_api_id'.
    k : int, number of previous stages to aggregate over.

    Returns
    -------
    DataFrame {match_api_id, home_team_last_k_scored, home_team_last_k_conceded,
    away_team_last_k_scored, away_team_last_k_conceded}.  Matches in stages
    where the metric cannot be computed (stage <= k) are imputed with -1.

    Fixes vs. the original: the imputation hardcoded 10 matches per stage
    (now uses the actual stage size), and the goal sums used a redundant
    second boolean mask built from a differently-indexed frame.
    """
    def _goals_last_k(window, team):
        # Goals scored/conceded by `team` across its home and away games in `window`.
        as_home = window[window['home_team_api_id'] == team]
        as_away = window[window['away_team_api_id'] == team]
        scored = as_home['home_team_goal'].sum() + as_away['away_team_goal'].sum()
        conceded = as_home['away_team_goal'].sum() + as_away['home_team_goal'].sum()
        return scored, conceded

    home_scored, home_conceded = [], []
    away_scored, away_conceded = [], []
    match_api_id = []
    for stage in range(1, 39):
        cur_stage = season_df[season_df['stage'] == stage]
        stage_ids = cur_stage['match_api_id'].tolist()
        match_api_id.extend(stage_ids)
        if stage <= k:
            # Not enough history: impute -1 for every match in this stage.
            home_scored.extend([-1] * len(stage_ids))
            home_conceded.extend([-1] * len(stage_ids))
            away_scored.extend([-1] * len(stage_ids))
            away_conceded.extend([-1] * len(stage_ids))
            continue
        # Games played in the k stages immediately before this one.
        window = season_df[(season_df['stage'] >= stage - k) & (season_df['stage'] < stage)]
        for team in cur_stage['home_team_api_id'].tolist():
            scored, conceded = _goals_last_k(window, team)
            home_scored.append(scored)
            home_conceded.append(conceded)
        for team in cur_stage['away_team_api_id'].tolist():
            scored, conceded = _goals_last_k(window, team)
            away_scored.append(scored)
            away_conceded.append(conceded)
    return pd.DataFrame({'home_team_last_k_scored': home_scored,
                         'home_team_last_k_conceded': home_conceded,
                         'away_team_last_k_scored': away_scored,
                         'away_team_last_k_conceded': away_conceded,
                         'match_api_id': match_api_id})
In [21]:
def CreateLastKFeature(df, lastk):
    """Build last-k goal features for every season in `df` and return them
    as one DataFrame keyed by 'match_api_id', with columns renamed to embed
    the k value (e.g. 'home_last_3_scored').

    Fixes vs. the original: it subset the GLOBAL `matches` frame instead of
    the `df` argument, and accumulated five parallel lists by hand; one
    concat over per-season frames is equivalent and simpler.

    NOTE(review): the 'away_last_%d_concede' name (missing final 'd') is
    intentional — later analysis cells reference that exact column name.
    """
    per_season = [LastKGoals(df[df['season'] == year], lastk)
                  for year in df['season'].unique()]
    combined = pd.concat(per_season, ignore_index=True)
    return combined.rename(columns={
        'home_team_last_k_scored': 'home_last_%d_scored' % lastk,
        'home_team_last_k_conceded': 'home_last_%d_conceded' % lastk,
        'away_team_last_k_scored': 'away_last_%d_scored' % lastk,
        'away_team_last_k_conceded': 'away_last_%d_concede' % lastk,
    })
In [22]:
## Create features for k = 2,3,4,5
start = time.time()
for k_value in range(2,6):
matches = matches.merge(CreateLastKFeature(matches,k_value), how='inner', on = 'match_api_id')
end = time.time()
diff = end-start
print 'Adding new k features took ' + str(round(diff,2)) + ' seconds'
print matches.shape
In [23]:
matches.tail()
Out[23]:
Let's check the results and make sure there aren't any missing values.
In [24]:
# Re-scan for columns that still contain missing values after the merges.
null_dict = {}
for col in matches.columns:
    n_missing = matches[col].isnull().sum()
    if n_missing:
        null_dict[col] = n_missing
null_dict
Out[24]:
Great! There aren't any nulls
In [25]:
#Let's do some investigation on the new features
cols = matches.columns[31:]
for col in cols:
matches[col].plot(kind='hist')
plt.title(col)
plt.show()
This concludes the 'last-k-goals' features!
The goal of this feature is to append a categorical feature 'Day of the Week' to each match. Because some matches happen during the middle of the week, maybe that could provide some information to the model.
In [26]:
# Keep only the date component of each match timestamp (drop the time part).
dates_raw = matches['date'].tolist()
dates_split = [stamp.split(' ')[0] for stamp in dates_raw]
dates_split[0]
Out[26]:
In [27]:
# Weekday index for each match (Monday=0 .. Sunday=6).
# List comprehension instead of bare `map`: on Python 3, `map` returns an
# iterator, which would not expand into a DataFrame column.
day_of_week = [time.strptime(d, '%m/%d/%y').tm_wday for d in dates_split]
matches['day_of_week'] = day_of_week
In [28]:
matches['day_of_week'].plot(kind='hist') #Most matches played on saturday (day of week =5), but matches are also played on other days
Out[28]:
In [29]:
matches['day_of_week'].value_counts() #Most matches played on saturday (day of week =5), but matches are also played on other days
Out[29]:
In [30]:
# Let's re-investigate our matches dataframe
matches.tail()
Out[30]:
In [31]:
# One binary column per weekday.
# Fix: the original assumed all 7 weekdays appear in the data; if one were
# absent, get_dummies would yield fewer columns and the rename of 7 names
# would fail.  Reindexing guarantees a 0-filled column for every weekday.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_of_week_dummies = pd.get_dummies(matches['day_of_week']).reindex(columns=list(range(7)), fill_value=0)
day_of_week_dummies.columns = day_names
# Add the new binary columns to the matches dataframe
for day in day_names:
    matches[day] = day_of_week_dummies[day]
del matches['day_of_week']
In [32]:
matches.head()
Out[32]:
In [33]:
teams = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_teams.csv')
print teams.shape
teams.head()
Out[33]:
In [34]:
#Let's reduce the dataframe only to the columns we need
keepcols = ['team_api_id','team_long_name']
for col in teams.columns:
if col not in keepcols:
del teams[col]
teams.head()
Out[34]:
In [35]:
matches = matches.merge(teams,how = 'left', left_on = 'home_team_api_id', right_on = 'team_api_id' )
matches = matches.merge(teams,how = 'left', left_on = 'away_team_api_id', right_on = 'team_api_id')
matches.head()
Out[35]:
In [36]:
remove_cols = ['team_api_id_x','team_api_id_y']
for col in matches.columns:
if col in remove_cols:
del matches[col]
matches.rename(columns={'team_long_name_x': 'Home_Team', 'team_long_name_y': 'Away_Team'}, inplace=True)
In [37]:
matches.season.value_counts()
Out[37]:
In [38]:
print matches.head()
print matches.shape # We now have 56 total columns (some of them we will get rid of..)
In [39]:
team_attributes = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_team_attributes.csv')
team_attributes.head()
Out[39]:
In [40]:
# Let's see what columns make up this dataframe
team_attributes.columns
Out[40]:
In [41]:
# Remove columns that we won't use
remove_cols = ['buildUpPlaySpeedClass','buildUpPlayDribbling','buildUpPlayPassingClass','chanceCreationPassingClass',\
'chanceCreationCrossingClass','chanceCreationShootingClass','defencePressureClass','defenceAggressionClass',\
'defenceTeamWidthClass','ID']
for col in team_attributes.columns:
if col in remove_cols:
del team_attributes[col]
team_attributes.shape ## Now we have just 16 columns
Out[41]:
In [42]:
# Split dates to get just the date component
dates_raw = team_attributes['date'].tolist()
dates_split = []
for date in dates_raw:
dates_split.append(date.split(' ')[0])
In [43]:
# Extract the year and month from each record and append to team_attributes dataframe
# Extract the year and month from each record and append to team_attributes.
# Fixes: bare `map` returns an iterator on Python 3 (would not expand into
# a column), and each date was parsed twice — parse once and reuse.
parsed_dates = [time.strptime(d, '%m/%d/%y') for d in dates_split]
team_attributes['month'] = [t.tm_mon for t in parsed_dates]
team_attributes['year'] = [t.tm_year for t in parsed_dates]
In [44]:
def DetermineSeason(df):
    """Map a record's 'year'/'month' to an EPL season string 'YYYY/YYYY+1'.

    Seasons roll over in August: months January-July belong to the season
    that started the previous calendar year.  Matches the original ladder
    exactly, including two quirks: year 2008 always maps to '2008/2009'
    regardless of month, and years outside 2008-2016 return None.

    Intended for use with DataFrame.apply(axis=1).
    """
    year = df['year']
    month = df['month']
    if year < 2008 or year > 2016:
        return None  # outside the covered range (original fell through to None)
    start = year if (year == 2008 or month >= 8) else year - 1
    return '%d/%d' % (start, start + 1)
In [45]:
team_attributes['season'] = team_attributes.apply(DetermineSeason, axis = 1)
In [46]:
team_attributes.head(20)
Out[46]:
In [47]:
team_attributes['season'].unique() # By using team attributes, it eliminates data from 2008/2009 and 2012/2013 season! shoot..
Out[47]:
In [48]:
team_attributes['season'].value_counts()
Out[48]:
In [49]:
## Let's test merging the team features with the matches dataframe
matches = matches.merge(team_attributes, how = 'inner', left_on = ['home_team_api_id','season'], right_on = \
['team_api_id','season'])
matches.head()
Out[49]:
In [50]:
matches = matches.merge(team_attributes, how = 'inner', left_on = ['away_team_api_id','season'], right_on = \
['team_api_id','season'])
In [51]:
matches.shape # We have reduced the rows from 3040 to 2280...lost 25% of the data!.. We lose 2008/2009 and 2012/2013
Out[51]:
In [52]:
matches.season.value_counts()
Out[52]:
In [ ]:
def Stage2Cat(df):
    """Label a match as played before or after the January transfer window,
    using stage 22 as the cutoff.  Intended for DataFrame.apply(axis=1)."""
    return 'Before Transfer Window' if df['stage'] < 22 else 'After Transfer Window'
In [ ]:
matches['Stage_Cat'] = matches.apply(Stage2Cat, axis = 1)
In [ ]:
stage_cat_dummies = pd.get_dummies(matches['Stage_Cat'])
del matches['Stage_Cat']
matches = pd.concat([matches,stage_cat_dummies], axis = 1)
The goal of this feature is to have a flag that indicates if a team was newly promoted that season
In [57]:
# Teams promoted into the EPL at the start of each season (three per year).
# NOTE(review): there is no entry for '2008/2009' or '2016/2017' — confirm
# those seasons never reach the lookups below before indexing by season.
promoted_teams = {'2009/2010': ['Wolverhampton Wanderers','Birmingham City','Burnley'],\
'2010/2011': ['Newcastle United','West Bromwich Albion','Blackpool'],\
'2011/2012': ['Queens Park Rangers','Norwich City','Swansea City'],\
'2012/2013': ['Reading','Southampton','West Ham United'],\
'2013/2014': ['Cardiff City','Hull City','Crystal Palace'],\
'2014/2015': ['Leicester City','Burnley','Queens Park Rangers'],\
'2015/2016': ['AFC Bournemouth','Watford','Norwich City']}
In [58]:
def HomePromotedTeam(df):
    """Return 1 if the row's home team was newly promoted for the season the
    match was played in, else 0.  Intended for DataFrame.apply(axis=1).

    Fix: the original looped over every season and returned inside the loop
    on BOTH branches, so only the first season in matches['season'].unique()
    was ever checked — never the match's own season.  `.get` also guards
    seasons absent from `promoted_teams` (e.g. '2008/2009').
    """
    return 1 if df['Home_Team'] in promoted_teams.get(df['season'], []) else 0
def AwayPromotedTeam(df):
    """Return 1 if the row's away team was newly promoted for the season the
    match was played in, else 0.  Intended for DataFrame.apply(axis=1).

    Fix: the original returned on the first loop iteration (both branches
    return), checking only the first season in the list instead of the
    match's own season.  `.get` guards seasons absent from `promoted_teams`.
    """
    return 1 if df['Away_Team'] in promoted_teams.get(df['season'], []) else 0
In [59]:
matches['Home_Promoted_Flag'] = matches.apply(HomePromotedTeam, axis=1)
matches['Away_Promoted_Flat'] = matches.apply(AwayPromotedTeam, axis = 1)
In [61]:
matches.shape
Out[61]:
In [58]:
for col in matches.columns:
if matches[col].dtype == 'object':
print col
In [59]:
def CreateDummies(df, col):
    """One-hot encode `col`, drop it from `df` (mutates the caller's frame),
    and return a new frame with the dummy columns appended.

    Home_Team/Away_Team get '_home_team'/'_away_team' suffixes so the two
    sides stay distinguishable; any other column gets a '_<col>' suffix.
    """
    suffix_map = {'Home_Team': '_home_team', 'Away_Team': '_away_team'}
    suffix = suffix_map.get(col, '_' + col)
    dummies = pd.get_dummies(df[col])
    del df[col]
    dummies.columns = dummies.columns + suffix
    return pd.concat([df, dummies], axis=1)
In [60]:
dummy_cols = ['season','Home_Team','Away_Team','buildUpPlayDribblingClass_x',\
'buildUpPlayPositioningClass_x', 'chanceCreationPositioningClass_x',\
'defenceDefenderLineClass_x','buildUpPlayDribblingClass_y','buildUpPlayPositioningClass_y',\
'chanceCreationPositioningClass_y','defenceDefenderLineClass_y']
In [61]:
for dcol in dummy_cols:
matches = CreateDummies(matches,dcol)
matches.shape
Out[61]:
In [62]:
for col in matches.columns:
if matches[col].dtype == 'object':
print col
In [63]:
# Let's remove the date columns that we won't be using anymore
for col in matches.columns:
if 'date' in col:
del matches[col]
matches.shape
Out[63]:
Last step - Remove all features that we don't want to use
In [64]:
matches.columns[60:70]
Out[64]:
In [65]:
# Drop ID and merge-artifact columns before modeling.
# errors='ignore' skips any column already absent — replaces the original
# bare `except Exception: continue`, which silently swallowed all errors.
remove_cols = ['id_x', 'match_api_id', 'home_team_api_id', 'away_team_api_id',
               'month_x', 'year_x', 'id', 'team_fifa_api_id_y', 'team_api_id_y',
               'id_y', 'team_fifa_api_id_x', 'team_api_id_x', 'month_y', 'year_y']
matches = matches.drop(remove_cols, axis=1, errors='ignore')
matches.shape
Out[65]:
In [66]:
# Finally export matches dataframe for modeling
matches.to_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/20170127_Matches_w_Features.csv',index=None)
In [67]:
from scipy.stats import pearsonr
pearsonr(matches['home_last_4_scored'],matches['Result_Target'])
Out[67]:
In [91]:
# Plot correlation of each last-k metric with the result, for k = 2..5.
# Pairing metrics with their titles replaces the original counter + if-chain.
metric_list = ['home_last_%d_scored', 'home_last_%d_conceded',
               'away_last_%d_scored', 'away_last_%d_concede']
plot_titles = ['Home Last K Scored', 'Home Last K Conceded',
               'Away Last K Scored', 'Away Last K Conceded']
k_range = range(2, 6)
for metric, title in zip(metric_list, plot_titles):
    correlations = [pearsonr(matches[metric % k], matches['Result_Target'])[0]
                    for k in k_range]
    plt.plot(k_range, correlations)
    plt.xlabel('Number of Last K Matches')
    plt.ylabel('Correlation with Result')
    plt.title(title)
    plt.show()
In [94]:
metric_list = ['Home_Promoted_Flag', 'Away_Promoted_Flat']
count = 0
corr_list = []
for metric in metric_list:
corr_list.append(pearsonr(matches[metric],matches['Result_Target'])[0])
In [102]:
print 'Home Promoted Flag Correlation with Target= ' + str(round(corr_list[0],2))
print 'Away Promoted Flag Correlation with Target= ' + str(round(corr_list[1],2))
In [ ]: