In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sqlite3
import numpy as np
from numpy import random
database = 'database.sqlite'
conn = sqlite3.connect(database)

In [2]:
query = """select * from match where home_player_X1 is not null and 'home_player_Y1' is not null 
and  home_player_X2 is not null and 'home_player_Y2' is not null 
and  home_player_X3 is not null and home_player_Y3 is not null 
and  home_player_X4 is not null and home_player_Y4 is not null 
and  home_player_X5 is not null and home_player_Y5 is not null 
and  home_player_X6 is not null and home_player_Y6 is not null 
and  home_player_X7 is not null and home_player_Y7 is not null 
and  home_player_X8 is not null and home_player_Y8 is not null 
and  home_player_X9 is not null and home_player_Y9 is not null 
and  home_player_X10 is not null and home_player_Y10 is not null 
and  home_player_X11 is not null and home_player_Y11 is not null  
and  away_player_X1 is not null and away_player_Y1 is not null 
and  away_player_X2 is not null and away_player_Y2 is not null 
and  away_player_X3 is not null and away_player_Y3 is not null 
and  away_player_X4 is not null and away_player_Y4 is not null 
and  away_player_X5 is not null and away_player_Y5 is not null 
and  away_player_X6 is not null and away_player_Y6 is not null 
and  away_player_X7 is not null and away_player_Y7 is not null 
and  away_player_X8 is not null and away_player_Y8 is not null 
and  away_player_X9 is not null and away_player_Y9 is not null 
and  away_player_X10 is not null and away_player_Y10 is not null 
and  away_player_X11 is not null and away_player_Y11 is not null 
and home_team_goal is not null and away_team_goal is not null
and home_player_1 is not null 
and home_player_2 is not null  
and home_player_3 is not null  
and home_player_4 is not null  
and home_player_5 is not null  
and home_player_6 is not null  
and home_player_7 is not null  
and home_player_8 is not null 
and home_player_9 is not null  
and home_player_10 is not null  
and home_player_11 is not null  
and away_player_1 is not null  
and away_player_2 is not null  
and away_player_3 is not null  
and away_player_4 is not null  
and away_player_5 is not null  
and away_player_6 is not null  
and away_player_7 is not null  
and away_player_8 is not null  
and away_player_9 is not null  
and away_player_10 is not null  
and away_player_11 is not null  
and  B365H is not null and B365D is not null and B365A is not null;"""
matches = pd.read_sql(query, conn)
matches


Out[2]:
id country_id league_id season stage date match_api_id home_team_api_id away_team_api_id home_team_goal ... SJA VCH VCD VCA GBH GBD GBA BSH BSD BSA
0 146 1 1 2008/2009 24 2009-02-27 00:00:00 493017 8203 9987 2 ... 2.30 2.65 3.25 2.35 2.90 3.25 2.30 2.80 3.20 2.25
1 154 1 1 2008/2009 25 2009-03-08 00:00:00 493025 9984 8342 1 ... 2.25 2.65 3.20 2.35 2.90 3.20 2.30 2.62 3.20 2.38
2 156 1 1 2008/2009 25 2009-03-07 00:00:00 493027 8635 10000 2 ... 8.50 1.30 4.35 8.00 1.35 4.33 8.50 1.36 4.20 7.00
3 163 1 1 2008/2009 26 2009-03-13 00:00:00 493034 8203 8635 2 ... 1.73 4.35 3.30 1.75 4.50 3.40 1.75 4.20 3.30 1.75
4 169 1 1 2008/2009 26 2009-03-14 00:00:00 493040 10000 9999 0 ... 5.00 1.65 3.50 4.50 1.65 3.50 5.00 1.70 3.40 4.33
5 174 1 1 2008/2009 27 2009-03-22 00:00:00 493045 9991 10000 1 ... 4.75 1.60 3.40 5.00 1.65 3.40 5.00 1.62 3.50 5.00
6 177 1 1 2008/2009 27 2009-03-21 00:00:00 493048 9999 8203 1 ... 2.75 2.25 3.25 2.80 2.10 3.25 3.15 2.25 3.20 2.75
7 190 1 1 2008/2009 29 2009-04-12 00:00:00 493061 8635 8342 1 ... 4.75 1.65 3.40 5.00 1.70 3.40 4.50 1.73 3.40 4.20
8 191 1 1 2008/2009 29 2009-04-10 00:00:00 493062 9999 9987 1 ... 2.15 2.80 3.25 2.25 3.20 3.20 2.10 2.80 3.20 2.25
9 220 1 1 2008/2009 31 2009-04-26 00:00:00 493082 9999 9991 1 ... 2.20 3.20 3.20 2.10 3.00 3.25 2.15 2.88 3.25 2.20
10 227 1 1 2008/2009 32 2009-05-02 00:00:00 493089 10000 9985 0 ... 1.62 5.00 3.50 1.60 5.25 3.50 1.60 5.00 3.60 1.57
11 230 1 1 2008/2009 32 2009-05-02 00:00:00 493092 9991 9984 2 ... 8.50 1.35 4.00 8.50 1.35 4.50 7.00 1.33 4.50 7.50
12 232 1 1 2008/2009 32 2009-05-02 00:00:00 493094 10001 9999 1 ... 5.50 1.55 3.50 5.50 1.57 3.60 5.25 1.57 3.60 5.00
13 235 1 1 2008/2009 33 2009-05-09 00:00:00 493097 9985 8342 2 ... 7.50 1.45 3.80 6.50 1.47 3.75 6.50 1.50 3.75 5.50
14 241 1 1 2008/2009 33 2009-05-09 00:00:00 493103 8635 9999 3 ... 16.00 1.14 6.50 13.00 1.15 6.50 13.00 1.17 6.00 13.00
15 243 1 1 2008/2009 33 2009-05-09 00:00:00 493105 9984 10001 2 ... 2.63 2.60 3.25 2.40 2.50 3.30 2.50 2.50 3.20 2.50
16 244 1 1 2008/2009 34 2009-05-16 00:00:00 493106 9987 8635 0 ... 1.45 6.50 4.00 1.44 6.50 3.75 1.45 7.00 4.00 1.40
17 245 1 1 2008/2009 34 2009-05-16 00:00:00 493107 9991 9985 0 ... 1.70 4.50 3.60 1.65 4.50 3.60 1.65 4.50 3.50 1.67
18 309 1 1 2009/2010 1 2009-08-02 00:00:00 665321 9984 9991 1 ... 2.30 2.62 3.25 2.40 2.70 3.20 2.40 2.88 3.10 2.30
19 310 1 1 2009/2010 1 2009-08-01 00:00:00 665322 9994 10000 1 ... 3.30 2.00 3.25 3.40 2.10 3.30 3.10 2.25 3.20 2.80
20 311 1 1 2009/2010 1 2009-08-01 00:00:00 665323 8571 8635 0 ... 1.57 5.50 3.60 1.57 5.25 3.50 1.60 5.50 3.60 1.53
21 314 1 1 2009/2010 10 2009-10-04 00:00:00 665411 8342 8635 4 ... 2.75 2.40 3.25 2.62 2.45 3.20 2.70 2.38 3.20 2.62
22 318 1 1 2009/2010 10 2009-10-03 00:00:00 665417 8203 9994 2 ... 4.00 1.85 3.30 3.75 1.95 3.25 3.75 1.91 3.25 3.60
23 320 1 1 2009/2010 10 2009-10-03 00:00:00 665421 9993 10001 3 ... 4.50 1.73 3.40 4.33 1.75 3.40 4.50 1.75 3.30 4.20
24 321 1 1 2009/2010 11 2009-10-18 00:00:00 665425 8342 9991 1 ... 4.75 1.70 3.50 4.00 1.60 3.50 5.25 1.67 3.40 4.75
25 323 1 1 2009/2010 11 2009-10-18 00:00:00 665427 9987 8203 1 ... 4.33 1.75 3.40 4.33 1.83 3.30 4.00 1.83 3.30 3.80
26 324 1 1 2009/2010 11 2009-10-17 00:00:00 665429 10000 9993 4 ... 3.60 1.90 3.30 3.60 2.05 3.25 3.30 2.00 3.25 3.30
27 325 1 1 2009/2010 11 2009-10-17 00:00:00 665430 9994 9984 1 ... 3.40 2.00 3.25 3.40 2.10 3.20 3.20 2.05 3.25 3.20
28 327 1 1 2009/2010 11 2009-10-17 00:00:00 665435 10001 9985 2 ... 1.67 5.50 3.40 1.60 5.00 3.40 1.65 4.75 3.40 1.67
29 328 1 1 2009/2010 12 2009-10-24 00:00:00 665438 9985 10000 1 ... 6.50 1.44 3.75 6.50 1.40 4.00 7.50 1.44 3.75 6.50
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19570 24492 21484 21484 2015/2016 6 2015-09-27 00:00:00 2030140 9783 8558 3 ... NaN 2.10 3.40 3.90 NaN NaN NaN NaN NaN NaN
19571 24493 21484 21484 2015/2016 6 2015-09-27 00:00:00 2030141 9869 8603 1 ... NaN 2.30 3.30 3.40 NaN NaN NaN NaN NaN NaN
19572 24494 21484 21484 2015/2016 7 2015-10-03 00:00:00 2030142 8302 8634 2 ... NaN 4.10 3.90 1.91 NaN NaN NaN NaN NaN NaN
19573 24496 21484 21484 2015/2016 7 2015-10-02 00:00:00 2030144 9910 8305 0 ... NaN 1.50 4.50 7.50 NaN NaN NaN NaN NaN NaN
19574 24497 21484 21484 2015/2016 7 2015-10-04 00:00:00 2030145 8581 10205 1 ... NaN 4.40 3.60 1.93 NaN NaN NaN NaN NaN NaN
19575 24498 21484 21484 2015/2016 7 2015-10-04 00:00:00 2030146 9906 8633 1 ... NaN 3.13 3.40 2.45 NaN NaN NaN NaN NaN NaN
19576 24499 21484 21484 2015/2016 7 2015-10-03 00:00:00 2030147 9864 8560 3 ... NaN 2.40 3.25 3.30 NaN NaN NaN NaN NaN NaN
19577 24500 21484 21484 2015/2016 7 2015-10-04 00:00:00 2030148 8315 10267 3 ... NaN 2.38 3.40 3.25 NaN NaN NaN NaN NaN NaN
19578 24501 21484 21484 2015/2016 7 2015-10-03 00:00:00 2030149 7878 9783 1 ... NaN 2.63 3.25 3.00 NaN NaN NaN NaN NaN NaN
19579 24502 21484 21484 2015/2016 7 2015-10-03 00:00:00 2030150 8558 9869 1 ... NaN 2.00 3.40 4.30 NaN NaN NaN NaN NaN NaN
19580 24503 21484 21484 2015/2016 7 2015-10-04 00:00:00 2030151 8370 8603 0 ... NaN 2.20 3.80 3.30 NaN NaN NaN NaN NaN NaN
19581 24504 21484 21484 2015/2016 8 2015-10-17 00:00:00 2030152 8634 8370 5 ... NaN 1.15 9.50 19.00 NaN NaN NaN NaN NaN NaN
19582 24505 21484 21484 2015/2016 8 2015-10-17 00:00:00 2030153 8372 8302 1 ... NaN 3.60 3.50 2.20 NaN NaN NaN NaN NaN NaN
19583 24507 21484 21484 2015/2016 8 2015-10-18 00:00:00 2030155 10205 9910 1 ... NaN 2.05 3.70 3.75 NaN NaN NaN NaN NaN NaN
19584 24508 21484 21484 2015/2016 8 2015-10-17 00:00:00 2030156 8633 8581 3 ... NaN 1.08 13.00 34.00 NaN NaN NaN NaN NaN NaN
19585 24509 21484 21484 2015/2016 8 2015-10-18 00:00:00 2030157 8560 9906 0 ... NaN 4.30 3.30 2.05 NaN NaN NaN NaN NaN NaN
19586 24510 21484 21484 2015/2016 8 2015-10-17 00:00:00 2030158 10267 9864 3 ... NaN 1.62 4.10 6.25 NaN NaN NaN NaN NaN NaN
19587 24511 21484 21484 2015/2016 8 2015-10-18 00:00:00 2030159 9783 8315 2 ... NaN 2.70 3.25 2.88 NaN NaN NaN NaN NaN NaN
19588 24512 21484 21484 2015/2016 8 2015-10-19 00:00:00 2030160 9869 7878 3 ... NaN 2.00 3.40 4.33 NaN NaN NaN NaN NaN NaN
19589 24513 21484 21484 2015/2016 8 2015-10-17 00:00:00 2030161 8603 8558 1 ... NaN 2.05 3.50 4.00 NaN NaN NaN NaN NaN NaN
19590 24514 21484 21484 2015/2016 9 2015-10-25 00:00:00 2030162 8634 8372 3 ... NaN 1.11 11.00 31.00 NaN NaN NaN NaN NaN NaN
19591 24515 21484 21484 2015/2016 9 2015-10-24 00:00:00 2030163 8302 8305 5 ... NaN 1.50 4.60 7.50 NaN NaN NaN NaN NaN NaN
19592 24516 21484 21484 2015/2016 9 2015-10-25 00:00:00 2030164 8306 10205 0 ... NaN 3.60 3.40 2.20 NaN NaN NaN NaN NaN NaN
19593 24517 21484 21484 2015/2016 9 2015-10-24 00:00:00 2030165 9910 8633 1 ... NaN 3.90 4.00 1.93 NaN NaN NaN NaN NaN NaN
19594 24518 21484 21484 2015/2016 9 2015-10-25 00:00:00 2030166 8581 8560 0 ... NaN 2.75 3.13 2.90 NaN NaN NaN NaN NaN NaN
19595 24519 21484 21484 2015/2016 9 2015-10-25 00:00:00 2030167 9906 10267 2 ... NaN 1.57 4.00 7.00 NaN NaN NaN NaN NaN NaN
19596 24520 21484 21484 2015/2016 9 2015-10-24 00:00:00 2030168 9864 9783 2 ... NaN 2.30 3.40 3.40 NaN NaN NaN NaN NaN NaN
19597 24521 21484 21484 2015/2016 9 2015-10-26 00:00:00 2030169 8315 9869 3 ... NaN 1.55 4.20 7.00 NaN NaN NaN NaN NaN NaN
19598 24522 21484 21484 2015/2016 9 2015-10-24 00:00:00 2030170 7878 8603 1 ... NaN 2.30 3.40 3.30 NaN NaN NaN NaN NaN NaN
19599 24523 21484 21484 2015/2016 9 2015-10-23 00:00:00 2030171 8370 8558 3 ... NaN 2.20 3.60 3.50 NaN NaN NaN NaN NaN NaN

19600 rows × 115 columns


In [3]:
query = """ select * from Player as p ,Player_Stats as s where p.player_api_id = s.player_api_id;"""
player = pd.read_sql(query, conn)
player[['player_api_id','date_stat']]


Out[3]:
player_api_id player_api_id date_stat
0 505942 505942 2016-02-18 00:00:00
1 505942 505942 2015-11-19 00:00:00
2 505942 505942 2015-09-21 00:00:00
3 505942 505942 2015-03-20 00:00:00
4 505942 505942 2007-02-22 00:00:00
5 155782 155782 2016-04-21 00:00:00
6 155782 155782 2016-04-07 00:00:00
7 155782 155782 2016-01-07 00:00:00
8 155782 155782 2015-12-24 00:00:00
9 155782 155782 2015-12-17 00:00:00
10 155782 155782 2015-10-16 00:00:00
11 155782 155782 2015-09-25 00:00:00
12 155782 155782 2015-09-21 00:00:00
13 155782 155782 2015-01-09 00:00:00
14 155782 155782 2014-12-05 00:00:00
15 155782 155782 2014-11-07 00:00:00
16 155782 155782 2014-09-18 00:00:00
17 155782 155782 2014-05-02 00:00:00
18 155782 155782 2014-04-04 00:00:00
19 155782 155782 2014-03-14 00:00:00
20 155782 155782 2013-12-13 00:00:00
21 155782 155782 2013-11-08 00:00:00
22 155782 155782 2013-10-04 00:00:00
23 155782 155782 2013-09-20 00:00:00
24 155782 155782 2013-05-03 00:00:00
25 155782 155782 2013-03-22 00:00:00
26 155782 155782 2013-03-15 00:00:00
27 155782 155782 2013-02-22 00:00:00
28 155782 155782 2013-02-15 00:00:00
29 155782 155782 2012-08-31 00:00:00
... ... ... ...
183223 108760 108760 2014-09-18 00:00:00
183224 108760 108760 2013-12-06 00:00:00
183225 108760 108760 2013-11-22 00:00:00
183226 108760 108760 2013-09-20 00:00:00
183227 108760 108760 2009-08-30 00:00:00
183228 108760 108760 2007-02-22 00:00:00
183229 39494 39494 2016-04-14 00:00:00
183230 39494 39494 2016-02-11 00:00:00
183231 39494 39494 2015-10-09 00:00:00
183232 39494 39494 2015-09-21 00:00:00
183233 39494 39494 2015-01-09 00:00:00
183234 39494 39494 2014-10-31 00:00:00
183235 39494 39494 2014-09-18 00:00:00
183236 39494 39494 2014-03-28 00:00:00
183237 39494 39494 2014-03-14 00:00:00
183238 39494 39494 2014-02-28 00:00:00
183239 39494 39494 2013-09-20 00:00:00
183240 39494 39494 2013-04-26 00:00:00
183241 39494 39494 2013-02-15 00:00:00
183242 39494 39494 2012-08-31 00:00:00
183243 39494 39494 2012-02-22 00:00:00
183244 39494 39494 2011-08-30 00:00:00
183245 39494 39494 2011-02-22 00:00:00
183246 39494 39494 2010-08-30 00:00:00
183247 39494 39494 2010-02-22 00:00:00
183248 39494 39494 2009-08-30 00:00:00
183249 39494 39494 2009-02-22 00:00:00
183250 39494 39494 2008-08-30 00:00:00
183251 39494 39494 2007-08-30 00:00:00
183252 39494 39494 2007-02-22 00:00:00

183253 rows × 3 columns


In [4]:
drop = matches.columns.values[-27:-1]
print drop 
#Removing other betting houses odds
matches = matches.drop(drop,1)
matches= matches.drop('BSA',1)
#Raw features
matches.columns.values


['BWH' 'BWD' 'BWA' 'IWH' 'IWD' 'IWA' 'LBH' 'LBD' 'LBA' 'PSH' 'PSD' 'PSA'
 'WHH' 'WHD' 'WHA' 'SJH' 'SJD' 'SJA' 'VCH' 'VCD' 'VCA' 'GBH' 'GBD' 'GBA'
 'BSH' 'BSD']
Out[4]:
array(['id', 'country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal', 'away_team_goal', 'home_player_X1',
       'home_player_X2', 'home_player_X3', 'home_player_X4',
       'home_player_X5', 'home_player_X6', 'home_player_X7',
       'home_player_X8', 'home_player_X9', 'home_player_X10',
       'home_player_X11', 'away_player_X1', 'away_player_X2',
       'away_player_X3', 'away_player_X4', 'away_player_X5',
       'away_player_X6', 'away_player_X7', 'away_player_X8',
       'away_player_X9', 'away_player_X10', 'away_player_X11',
       'home_player_Y1', 'home_player_Y2', 'home_player_Y3',
       'home_player_Y4', 'home_player_Y5', 'home_player_Y6',
       'home_player_Y7', 'home_player_Y8', 'home_player_Y9',
       'home_player_Y10', 'home_player_Y11', 'away_player_Y1',
       'away_player_Y2', 'away_player_Y3', 'away_player_Y4',
       'away_player_Y5', 'away_player_Y6', 'away_player_Y7',
       'away_player_Y8', 'away_player_Y9', 'away_player_Y10',
       'away_player_Y11', 'home_player_1', 'home_player_2',
       'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
       'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10',
       'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3',
       'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7',
       'away_player_8', 'away_player_9', 'away_player_10',
       'away_player_11', 'goal', 'shoton', 'shotoff', 'foulcommit', 'card',
       'cross', 'corner', 'possession', 'B365H', 'B365D', 'B365A'], dtype=object)

In [5]:
matches.columns.values


Out[5]:
array(['id', 'country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal', 'away_team_goal', 'home_player_X1',
       'home_player_X2', 'home_player_X3', 'home_player_X4',
       'home_player_X5', 'home_player_X6', 'home_player_X7',
       'home_player_X8', 'home_player_X9', 'home_player_X10',
       'home_player_X11', 'away_player_X1', 'away_player_X2',
       'away_player_X3', 'away_player_X4', 'away_player_X5',
       'away_player_X6', 'away_player_X7', 'away_player_X8',
       'away_player_X9', 'away_player_X10', 'away_player_X11',
       'home_player_Y1', 'home_player_Y2', 'home_player_Y3',
       'home_player_Y4', 'home_player_Y5', 'home_player_Y6',
       'home_player_Y7', 'home_player_Y8', 'home_player_Y9',
       'home_player_Y10', 'home_player_Y11', 'away_player_Y1',
       'away_player_Y2', 'away_player_Y3', 'away_player_Y4',
       'away_player_Y5', 'away_player_Y6', 'away_player_Y7',
       'away_player_Y8', 'away_player_Y9', 'away_player_Y10',
       'away_player_Y11', 'home_player_1', 'home_player_2',
       'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
       'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10',
       'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3',
       'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7',
       'away_player_8', 'away_player_9', 'away_player_10',
       'away_player_11', 'goal', 'shoton', 'shotoff', 'foulcommit', 'card',
       'cross', 'corner', 'possession', 'B365H', 'B365D', 'B365A'], dtype=object)

In [6]:
matches= matches.drop(['goal', 'shoton', 'shotoff', 'foulcommit', 'card',
       'cross', 'corner', 'possession'],1)

In [7]:
#Transforming date column into a date type
matches['date']=pd.to_datetime(matches['date'], format='%Y-%m-%d %H:%M:%S.%f')
#
matches['date']=matches['date']-matches['date'].unique()[0]
matches['date']=matches['date'].astype('timedelta64[D]')

In [8]:
#matches=matches.drop(['home_player_1', 'home_player_2','home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
#       'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10',
#       'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3',
#       'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7',
#       'away_player_8', 'away_player_9', 'away_player_10',
#       'away_player_11'])


#Transforming season column into categorical value
from sklearn import preprocessing 
le = preprocessing.LabelEncoder()
matches['season'] = le.fit_transform(matches['season'].astype('str'))

In [9]:
matches['season'].describe()


Out[9]:
count    19600.000000
mean         3.714541
std          2.215884
min          0.000000
25%          2.000000
50%          4.000000
75%          6.000000
max          7.000000
Name: season, dtype: float64

In [10]:
matches.columns.values


Out[10]:
array(['id', 'country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal', 'away_team_goal', 'home_player_X1',
       'home_player_X2', 'home_player_X3', 'home_player_X4',
       'home_player_X5', 'home_player_X6', 'home_player_X7',
       'home_player_X8', 'home_player_X9', 'home_player_X10',
       'home_player_X11', 'away_player_X1', 'away_player_X2',
       'away_player_X3', 'away_player_X4', 'away_player_X5',
       'away_player_X6', 'away_player_X7', 'away_player_X8',
       'away_player_X9', 'away_player_X10', 'away_player_X11',
       'home_player_Y1', 'home_player_Y2', 'home_player_Y3',
       'home_player_Y4', 'home_player_Y5', 'home_player_Y6',
       'home_player_Y7', 'home_player_Y8', 'home_player_Y9',
       'home_player_Y10', 'home_player_Y11', 'away_player_Y1',
       'away_player_Y2', 'away_player_Y3', 'away_player_Y4',
       'away_player_Y5', 'away_player_Y6', 'away_player_Y7',
       'away_player_Y8', 'away_player_Y9', 'away_player_Y10',
       'away_player_Y11', 'home_player_1', 'home_player_2',
       'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
       'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10',
       'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3',
       'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7',
       'away_player_8', 'away_player_9', 'away_player_10',
       'away_player_11', 'B365H', 'B365D', 'B365A'], dtype=object)

In [ ]:


In [11]:
import graphlab as gl
gl.canvas.set_target('ipynb')

In [12]:
#train_data,test_data = matches_without_ids.random_split(.9, seed=0)
#matches['H']=(matches['home_team_goal']>matches['away_team_goal']).astype(int)
#matches['A']=(matches['home_team_goal']<matches['away_team_goal']).astype(int)

def determine_home_result(match):
    if match['home_team_goal'] > match['away_team_goal']:
        return 'H'
    elif match['home_team_goal'] < match['away_team_goal']:
        return 'A'
    else:
        return 'D'
    
    
matches['Output']=matches.apply(determine_home_result, axis=1)

In [13]:
matches_without_ids = matches.drop(['home_player_1', 'home_player_2',
       'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
       'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10',
       'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3',
       'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7',
       'away_player_8', 'away_player_9', 'away_player_10',
       'away_player_11','id','home_team_goal','away_team_goal'],1)

In [14]:
matches_without_ids.columns.values


Out[14]:
array(['country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_player_X1', 'home_player_X2', 'home_player_X3',
       'home_player_X4', 'home_player_X5', 'home_player_X6',
       'home_player_X7', 'home_player_X8', 'home_player_X9',
       'home_player_X10', 'home_player_X11', 'away_player_X1',
       'away_player_X2', 'away_player_X3', 'away_player_X4',
       'away_player_X5', 'away_player_X6', 'away_player_X7',
       'away_player_X8', 'away_player_X9', 'away_player_X10',
       'away_player_X11', 'home_player_Y1', 'home_player_Y2',
       'home_player_Y3', 'home_player_Y4', 'home_player_Y5',
       'home_player_Y6', 'home_player_Y7', 'home_player_Y8',
       'home_player_Y9', 'home_player_Y10', 'home_player_Y11',
       'away_player_Y1', 'away_player_Y2', 'away_player_Y3',
       'away_player_Y4', 'away_player_Y5', 'away_player_Y6',
       'away_player_Y7', 'away_player_Y8', 'away_player_Y9',
       'away_player_Y10', 'away_player_Y11', 'B365H', 'B365D', 'B365A',
       'Output'], dtype=object)

In [15]:
pure_data = gl.SFrame(matches_without_ids)
train_data,test_data = pure_data.random_split(.8, seed=0)
#train,valid=train_data.random_split(.8,seed=0)
folds = gl.cross_validation.KFold(train_data, 5)


This non-commercial license of GraphLab Create for academic use is assigned to felipeapfernandes@gmail.com and will expire on August 26, 2017.
[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1476546209.log

In [16]:
model_kfolds=[]
for train,valid in folds:
    #(train,valid) = folds[i]
    model = gl.logistic_classifier.create(train,
                                                   target='Output',
                                                 features=['country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_player_X1', 'home_player_X2', 'home_player_X3',
       'home_player_X4', 'home_player_X5', 'home_player_X6',
       'home_player_X7', 'home_player_X8', 'home_player_X9',
       'home_player_X10', 'home_player_X11', 'away_player_X1',
       'away_player_X2', 'away_player_X3', 'away_player_X4',
       'away_player_X5', 'away_player_X6', 'away_player_X7',
       'away_player_X8', 'away_player_X9', 'away_player_X10',
       'away_player_X11', 'home_player_Y1', 'home_player_Y2',
       'home_player_Y3', 'home_player_Y4', 'home_player_Y5',
       'home_player_Y6', 'home_player_Y7', 'home_player_Y8',
       'home_player_Y9', 'home_player_Y10', 'home_player_Y11',
       'away_player_Y1', 'away_player_Y2', 'away_player_Y3',
       'away_player_Y4', 'away_player_Y5', 'away_player_Y6',
       'away_player_Y7', 'away_player_Y8', 'away_player_Y9',
       'away_player_Y10', 'away_player_Y11','B365H', 'B365D', 'B365A'],
                                                validation_set=valid)
    model_kfolds.append(model)


WARNING: Detected extremely low variance for feature(s) 'home_player_Y2', 'away_player_Y2', 'away_player_Y3' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Logistic regression:
--------------------------------------------------------
Number of examples          : 12597
Number of classes           : 3
Number of feature columns   : 55
Number of unpacked features : 55
Number of coefficients    : 112
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+--------------+-------------------+---------------------+
| 1         | 2        | 1.183762     | 0.529809          | 0.522540            |
| 2         | 3        | 1.291705     | 0.534175          | 0.526984            |
| 3         | 4        | 1.398602     | 0.533778          | 0.526984            |
| 4         | 5        | 1.526274     | 0.534016          | 0.527302            |
| 5         | 6        | 1.640237     | 0.534016          | 0.527302            |
+-----------+----------+--------------+-------------------+---------------------+
SUCCESS: Optimal solution found.

WARNING: Detected extremely low variance for feature(s) 'home_player_Y2', 'away_player_Y2', 'away_player_Y3', 'away_player_Y4' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Logistic regression:
--------------------------------------------------------
Number of examples          : 12597
Number of classes           : 3
Number of feature columns   : 55
Number of unpacked features : 55
Number of coefficients    : 112
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+--------------+-------------------+---------------------+
| 1         | 2        | 0.185095     | 0.535524          | 0.495873            |
| 2         | 3        | 0.315123     | 0.539811          | 0.500317            |
| 3         | 4        | 0.424920     | 0.540287          | 0.498413            |
| 4         | 5        | 0.534586     | 0.540208          | 0.499048            |
| 5         | 6        | 0.649053     | 0.540208          | 0.499048            |
+-----------+----------+--------------+-------------------+---------------------+
SUCCESS: Optimal solution found.

WARNING: Detected extremely low variance for feature(s) 'home_player_X1', 'home_player_Y1', 'home_player_Y2', 'home_player_Y3', 'away_player_Y2', 'away_player_Y3' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Logistic regression:
--------------------------------------------------------
Number of examples          : 12598
Number of classes           : 3
Number of feature columns   : 55
Number of unpacked features : 55
Number of coefficients    : 112
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+--------------+-------------------+---------------------+
| 1         | 2        | 0.171700     | 0.530243          | 0.530010            |
| 2         | 3        | 0.290582     | 0.534688          | 0.527151            |
| 3         | 4        | 0.402009     | 0.535164          | 0.528422            |
| 4         | 5        | 0.520443     | 0.535085          | 0.528739            |
| 5         | 6        | 0.632263     | 0.535085          | 0.528739            |
+-----------+----------+--------------+-------------------+---------------------+
SUCCESS: Optimal solution found.

WARNING: Detected extremely low variance for feature(s) 'home_player_Y2', 'away_player_Y2', 'away_player_Y3' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Logistic regression:
--------------------------------------------------------
Number of examples          : 12598
Number of classes           : 3
Number of feature columns   : 55
Number of unpacked features : 55
Number of coefficients    : 112
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+--------------+-------------------+---------------------+
| 1         | 2        | 0.172361     | 0.526512          | 0.542394            |
| 2         | 3        | 0.285408     | 0.529767          | 0.543347            |
| 3         | 4        | 0.401147     | 0.530005          | 0.543347            |
| 4         | 5        | 0.524673     | 0.529687          | 0.543347            |
| 5         | 6        | 0.635208     | 0.529687          | 0.543347            |
+-----------+----------+--------------+-------------------+---------------------+
SUCCESS: Optimal solution found.

WARNING: Detected extremely low variance for feature(s) 'home_player_Y2', 'away_player_Y2', 'away_player_Y3' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Logistic regression:
--------------------------------------------------------
Number of examples          : 12598
Number of classes           : 3
Number of feature columns   : 55
Number of unpacked features : 55
Number of coefficients    : 112
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+--------------+-------------------+---------------------+
| 1         | 2        | 0.178313     | 0.526433          | 0.544935            |
| 2         | 3        | 0.292984     | 0.528655          | 0.544300            |
| 3         | 4        | 0.405852     | 0.528576          | 0.543982            |
| 4         | 5        | 0.514648     | 0.528497          | 0.543982            |
+-----------+----------+--------------+-------------------+---------------------+
SUCCESS: Optimal solution found.


In [17]:
#result=pure_model.evaluate(test_data)
result=[]
for model in model_kfolds:
    result.append(model.evaluate(test_data))

In [18]:
#pure_model.show(view='Evaluation')
result


Out[18]:
[{'accuracy': 0.5284194134440695,
  'auc': 0.6526528046893878,
  'confusion_matrix': Columns:
  	target_label	str
  	predicted_label	str
  	count	int
  
  Rows: 9
  
  Data:
  +--------------+-----------------+-------+
  | target_label | predicted_label | count |
  +--------------+-----------------+-------+
  |      H       |        D        |   8   |
  |      A       |        D        |   12  |
  |      D       |        D        |   8   |
  |      D       |        H        |  710  |
  |      H       |        H        |  1495 |
  |      H       |        A        |  249  |
  |      D       |        A        |  252  |
  |      A       |        A        |  533  |
  |      A       |        H        |  586  |
  +--------------+-----------------+-------+
  [9 rows x 3 columns],
  'f1_score': 0.3888554069777361,
  'log_loss': 0.9841838473238556,
  'precision': 0.4456128260263131,
  'recall': 0.44427409759320313,
  'roc_curve': Columns:
  	threshold	float
  	fpr	float
  	tpr	float
  	p	int
  	n	int
  	class	int
  
  Rows: 300003
  
  Data:
  +-----------+----------------+-----+------+------+-------+
  | threshold |      fpr       | tpr |  p   |  n   | class |
  +-----------+----------------+-----+------+------+-------+
  |    0.0    |      1.0       | 1.0 | 1131 | 2722 |   0   |
  |   1e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   2e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   3e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   4e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   5e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   6e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   7e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   8e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   9e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  +-----------+----------------+-----+------+------+-------+
  [300003 rows x 6 columns]
  Note: Only the head of the SFrame is printed.
  You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.},
 {'accuracy': 0.5242668050869452,
  'auc': 0.6532557245341993,
  'confusion_matrix': Columns:
  	target_label	str
  	predicted_label	str
  	count	int
  
  Rows: 9
  
  Data:
  +--------------+-----------------+-------+
  | target_label | predicted_label | count |
  +--------------+-----------------+-------+
  |      A       |        D        |   8   |
  |      D       |        D        |   6   |
  |      H       |        D        |   11  |
  |      H       |        H        |  1505 |
  |      D       |        H        |  717  |
  |      D       |        A        |  247  |
  |      H       |        A        |  236  |
  |      A       |        A        |  509  |
  |      A       |        H        |  614  |
  +--------------+-----------------+-------+
  [9 rows x 3 columns],
  'f1_score': 0.3825432379258673,
  'log_loss': 0.983617354540887,
  'precision': 0.42792728286091264,
  'recall': 0.43841601350513004,
  'roc_curve': Columns:
  	threshold	float
  	fpr	float
  	tpr	float
  	p	int
  	n	int
  	class	int
  
  Rows: 300003
  
  Data:
  +-----------+----------------+-----+------+------+-------+
  | threshold |      fpr       | tpr |  p   |  n   | class |
  +-----------+----------------+-----+------+------+-------+
  |    0.0    |      1.0       | 1.0 | 1131 | 2722 |   0   |
  |   1e-05   |      1.0       | 1.0 | 1131 | 2722 |   0   |
  |   2e-05   |      1.0       | 1.0 | 1131 | 2722 |   0   |
  |   3e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   4e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   5e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   6e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   7e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   8e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   9e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  +-----------+----------------+-----+------+------+-------+
  [300003 rows x 6 columns]
  Note: Only the head of the SFrame is printed.
  You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.},
 {'accuracy': 0.5258240332208669,
  'auc': 0.6565211030211825,
  'confusion_matrix': Columns:
  	target_label	str
  	predicted_label	str
  	count	int
  
  Rows: 9
  
  Data:
  +--------------+-----------------+-------+
  | target_label | predicted_label | count |
  +--------------+-----------------+-------+
  |      H       |        D        |   27  |
  |      A       |        D        |   22  |
  |      D       |        D        |   19  |
  |      D       |        H        |  697  |
  |      H       |        H        |  1489 |
  |      H       |        A        |  236  |
  |      D       |        A        |  254  |
  |      A       |        A        |  518  |
  |      A       |        H        |  591  |
  +--------------+-----------------+-------+
  [9 rows x 3 columns],
  'f1_score': 0.3928292116642802,
  'log_loss': 0.9824502417796135,
  'precision': 0.4431635956106925,
  'recall': 0.4424917473204779,
  'roc_curve': Columns:
  	threshold	float
  	fpr	float
  	tpr	float
  	p	int
  	n	int
  	class	int
  
  Rows: 300003
  
  Data:
  +-----------+----------------+-----+------+------+-------+
  | threshold |      fpr       | tpr |  p   |  n   | class |
  +-----------+----------------+-----+------+------+-------+
  |    0.0    |      1.0       | 1.0 | 1131 | 2722 |   0   |
  |   1e-05   |      1.0       | 1.0 | 1131 | 2722 |   0   |
  |   2e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   3e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   4e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   5e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   6e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   7e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   8e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   9e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  +-----------+----------------+-----+------+------+-------+
  [300003 rows x 6 columns]
  Note: Only the head of the SFrame is printed.
  You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.},
 {'accuracy': 0.5263431092655074,
  'auc': 0.6549354976301381,
  'confusion_matrix': Columns:
  	target_label	str
  	predicted_label	str
  	count	int
  
  Rows: 9
  
  Data:
  +--------------+-----------------+-------+
  | target_label | predicted_label | count |
  +--------------+-----------------+-------+
  |      H       |        D        |   18  |
  |      A       |        D        |   18  |
  |      D       |        D        |   21  |
  |      D       |        H        |  724  |
  |      H       |        H        |  1511 |
  |      H       |        A        |  223  |
  |      D       |        A        |  225  |
  |      A       |        A        |  496  |
  |      A       |        H        |  617  |
  +--------------+-----------------+-------+
  [9 rows x 3 columns],
  'f1_score': 0.39178461790825175,
  'log_loss': 0.9824630639160944,
  'precision': 0.4745494760029844,
  'recall': 0.4408807875672823,
  'roc_curve': Columns:
  	threshold	float
  	fpr	float
  	tpr	float
  	p	int
  	n	int
  	class	int
  
  Rows: 300003
  
  Data:
  +-----------+----------------+-----+------+------+-------+
  | threshold |      fpr       | tpr |  p   |  n   | class |
  +-----------+----------------+-----+------+------+-------+
  |    0.0    |      1.0       | 1.0 | 1131 | 2722 |   0   |
  |   1e-05   |      1.0       | 1.0 | 1131 | 2722 |   0   |
  |   2e-05   |      1.0       | 1.0 | 1131 | 2722 |   0   |
  |   3e-05   |      1.0       | 1.0 | 1131 | 2722 |   0   |
  |   4e-05   |      1.0       | 1.0 | 1131 | 2722 |   0   |
  |   5e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   6e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   7e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   8e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   9e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  +-----------+----------------+-----+------+------+-------+
  [300003 rows x 6 columns]
  Note: Only the head of the SFrame is printed.
  You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.},
 {'accuracy': 0.5258240332208669,
  'auc': 0.6528051553133635,
  'confusion_matrix': Columns:
  	target_label	str
  	predicted_label	str
  	count	int
  
  Rows: 9
  
  Data:
  +--------------+-----------------+-------+
  | target_label | predicted_label | count |
  +--------------+-----------------+-------+
  |      A       |        D        |   23  |
  |      D       |        D        |   24  |
  |      H       |        D        |   27  |
  |      D       |        H        |  704  |
  |      H       |        H        |  1484 |
  |      H       |        A        |  241  |
  |      D       |        A        |  242  |
  |      A       |        A        |  518  |
  |      A       |        H        |  590  |
  +--------------+-----------------+-------+
  [9 rows x 3 columns],
  'f1_score': 0.3956977849680828,
  'log_loss': 0.985605370988062,
  'precision': 0.4586680353419015,
  'recall': 0.44325866661938423,
  'roc_curve': Columns:
  	threshold	float
  	fpr	float
  	tpr	float
  	p	int
  	n	int
  	class	int
  
  Rows: 300003
  
  Data:
  +-----------+----------------+-----+------+------+-------+
  | threshold |      fpr       | tpr |  p   |  n   | class |
  +-----------+----------------+-----+------+------+-------+
  |    0.0    |      1.0       | 1.0 | 1131 | 2722 |   0   |
  |   1e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   2e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   3e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   4e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   5e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   6e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   7e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   8e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  |   9e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
  +-----------+----------------+-----+------+------+-------+
  [300003 rows x 6 columns]
  Note: Only the head of the SFrame is printed.
  You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}]

In [19]:
top = model.predict_topk(test_data, output_type='probability', k = 3)

In [20]:
print top


+----+-------+----------------+
| id | class |  probability   |
+----+-------+----------------+
| 0  |   H   | 0.576467886553 |
| 0  |   D   | 0.234468528222 |
| 0  |   A   | 0.189063585225 |
| 1  |   H   | 0.388531413143 |
| 1  |   A   | 0.356242476119 |
| 1  |   D   | 0.255226110738 |
| 2  |   H   | 0.483082240589 |
| 2  |   D   | 0.272892422591 |
| 2  |   A   | 0.24402533682  |
| 3  |   H   | 0.502684594609 |
+----+-------+----------------+
[11559 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [21]:
pred = model.predict(test_data)

In [22]:
(test_data['Output']==pred).sum()/(len(pred)*1.0)


Out[22]:
0.5258240332208669

In [ ]:


In [23]:
import re
def def_formations(matches_positions):
    pos=matches_positions.to_dataframe()
    form=[]
    for index,row in pos.iterrows():
        b= row.values
        dfs = (b <= 3).sum()
        mid1 = ((b >= 4) & (b<=6)).sum()
        mid2 = ((b >= 7) & (b<=9)).sum()
        atk1 = ((b >= 10)).sum()
        formation="%d-%d-%d-%d"%(dfs,mid1,mid2,atk1)
        formation = re.sub('0-','',formation)
        form.append(formation)
    return form

In [24]:
positions_home= pure_data[['home_player_Y2',
                     'home_player_Y3',
                     'home_player_Y4',
                     'home_player_Y5',
                     'home_player_Y6',
                     'home_player_Y7',
                     'home_player_Y8',
                     'home_player_Y9',
                     'home_player_Y10',
                     'home_player_Y11']]
positions_away = pure_data[['away_player_Y2',
                     'away_player_Y3',
                     'away_player_Y4',
                     'away_player_Y5',
                     'away_player_Y6',
                     'away_player_Y7',
                     'away_player_Y8',
                     'away_player_Y9',
                     'away_player_Y10',
                     'away_player_Y11']]
formation_home=def_formations(positions_home)
formation_away=def_formations(positions_away)

In [25]:
pure_data['formation_h']=formation_home
pure_data['formation_a']=formation_away

pure_data['formation_a'].unique()


Out[25]:
dtype: str
Rows: 19
['3-2-3-2', '3-5-2', '4-2-2-2', '3-1-4-2', '3-4-3', '4-4-2', '4-1-2-3', '4-2-1-3', '3-3-3-1', '4-3-2-1', '4-1-4-1', '4-3-1-2', '4-5-1', '5-3-2', '4-1-3-2', '4-3-3', '4-2-3-1', '3-6-1', '5-4-1']

In [26]:
data= pure_data

In [27]:
data=pure_data.to_dataframe().drop(['home_player_X1', 'home_player_X2', 'home_player_X3',
       'home_player_X4', 'home_player_X5', 'home_player_X6',
       'home_player_X7', 'home_player_X8', 'home_player_X9',
       'home_player_X10', 'home_player_X11', 'away_player_X1',
       'away_player_X2', 'away_player_X3', 'away_player_X4',
       'away_player_X5', 'away_player_X6', 'away_player_X7',
       'away_player_X8', 'away_player_X9', 'away_player_X10',
       'away_player_X11', 'home_player_Y1', 'home_player_Y2',
       'home_player_Y3', 'home_player_Y4', 'home_player_Y5',
       'home_player_Y6', 'home_player_Y7', 'home_player_Y8',
       'home_player_Y9', 'home_player_Y10', 'home_player_Y11',
       'away_player_Y1', 'away_player_Y2', 'away_player_Y3',
       'away_player_Y4', 'away_player_Y5', 'away_player_Y6',
       'away_player_Y7', 'away_player_Y8', 'away_player_Y9',
       'away_player_Y10', 'away_player_Y11'],1)

In [28]:
data = gl.SFrame(data)
data.show()



In [29]:
train_data,test_data = data.random_split(.8, seed=0)
train,valid=train_data.random_split(.8,seed=0)
model = gl.logistic_classifier.create(train,target='Output',
                                      features=['country_id','league_id','season','stage','date','match_api_id',
                                                'home_team_api_id','away_team_api_id','B365H','B365D','B365A',
                                                'formation_h','formation_a'], validation_set=valid)


Logistic regression:
--------------------------------------------------------
Number of examples          : 12674
Number of classes           : 3
Number of feature columns   : 13
Number of unpacked features : 13
Number of coefficients    : 96
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+--------------+-------------------+---------------------+
| 1         | 2        | 0.089409     | 0.530614          | 0.526196            |
| 2         | 3        | 0.149522     | 0.534007          | 0.529450            |
| 3         | 4        | 0.209564     | 0.533139          | 0.530426            |
| 4         | 5        | 0.270951     | 0.533297          | 0.530101            |
| 5         | 6        | 0.332927     | 0.533297          | 0.530101            |
| 6         | 7        | 0.393710     | 0.533297          | 0.530101            |
+-----------+----------+--------------+-------------------+---------------------+

In [30]:
model.evaluate(test_data)


SUCCESS: Optimal solution found.

Out[30]:
{'accuracy': 0.5315338697119127,
 'auc': 0.6526445952145282,
 'confusion_matrix': Columns:
 	target_label	str
 	predicted_label	str
 	count	int
 
 Rows: 9
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      H       |        D        |   13  |
 |      A       |        D        |   15  |
 |      D       |        D        |   18  |
 |      H       |        H        |  1511 |
 |      D       |        H        |  708  |
 |      D       |        A        |  244  |
 |      H       |        A        |  228  |
 |      A       |        A        |  519  |
 |      A       |        H        |  597  |
 +--------------+-----------------+-------+
 [9 rows x 3 columns],
 'f1_score': 0.39538430285888615,
 'log_loss': 0.9864334331563913,
 'precision': 0.4838648243862084,
 'recall': 0.4466285216833064,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 	class	int
 
 Rows: 300003
 
 Data:
 +-----------+----------------+-----+------+------+-------+
 | threshold |      fpr       | tpr |  p   |  n   | class |
 +-----------+----------------+-----+------+------+-------+
 |    0.0    |      1.0       | 1.0 | 1131 | 2722 |   0   |
 |   1e-05   |      1.0       | 1.0 | 1131 | 2722 |   0   |
 |   2e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
 |   3e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
 |   4e-05   | 0.999632623071 | 1.0 | 1131 | 2722 |   0   |
 |   5e-05   | 0.999265246143 | 1.0 | 1131 | 2722 |   0   |
 |   6e-05   | 0.999265246143 | 1.0 | 1131 | 2722 |   0   |
 |   7e-05   | 0.999265246143 | 1.0 | 1131 | 2722 |   0   |
 |   8e-05   | 0.999265246143 | 1.0 | 1131 | 2722 |   0   |
 |   9e-05   | 0.999265246143 | 1.0 | 1131 | 2722 |   0   |
 +-----------+----------------+-----+------+------+-------+
 [300003 rows x 6 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}

In [31]:
pred = model.predict(test_data)
(test_data['Output']==pred).sum()/(len(pred)*1.0)


Out[31]:
0.5315338697119127

Feature Engineering


In [40]:
def ExtractGoalTendency(values):
    #data = values.to_dataframe()
    values.sort_values(by=['league_id','season','stage'])
    return values
    
    
    

data_goals = ExtractGoalTendency(data.to_dataframe())
data_goals


Out[40]:
country_id league_id season stage date match_api_id home_team_api_id away_team_api_id B365H B365D B365A Output formation_h formation_a
0 1 1 0 24 0.0 493017 8203 9987 3.00 3.40 2.30 H 4-4-2 4-4-2
1 1 1 0 25 9.0 493025 9984 8342 2.80 3.20 2.37 A 4-4-2 4-4-2
2 1 1 0 25 8.0 493027 8635 10000 1.40 4.50 8.00 H 4-4-2 4-4-2
3 1 1 0 26 14.0 493034 8203 8635 4.50 3.60 1.75 H 4-4-2 4-4-2
4 1 1 0 26 15.0 493040 10000 9999 1.75 3.50 4.75 D 4-4-2 4-4-2
5 1 1 0 27 23.0 493045 9991 10000 1.65 3.60 4.75 H 4-4-2 4-4-2
6 1 1 0 27 22.0 493048 9999 8203 2.20 3.30 3.00 H 4-4-2 4-4-2
7 1 1 0 29 44.0 493061 8635 8342 1.67 3.50 5.00 H 4-4-2 4-4-2
8 1 1 0 29 42.0 493062 9999 9987 3.20 3.25 2.25 H 4-4-2 4-4-2
9 1 1 0 31 58.0 493082 9999 9991 3.20 3.30 2.10 A 4-4-2 4-4-2
10 1 1 0 32 64.0 493089 10000 9985 5.50 3.60 1.57 D 4-4-2 4-4-2
11 1 1 0 32 64.0 493092 9991 9984 1.36 4.33 7.50 H 4-4-2 4-4-2
12 1 1 0 32 64.0 493094 10001 9999 1.57 3.60 5.50 H 4-4-2 4-4-2
13 1 1 0 33 71.0 493097 9985 8342 1.44 4.33 7.00 H 4-4-2 4-4-2
14 1 1 0 33 71.0 493103 8635 9999 1.14 7.50 19.00 H 4-4-2 4-4-2
15 1 1 0 33 71.0 493105 9984 10001 2.62 3.30 2.62 H 4-4-2 4-4-2
16 1 1 0 34 78.0 493106 9987 8635 8.00 4.00 1.40 A 4-4-2 4-4-2
17 1 1 0 34 78.0 493107 9991 9985 4.33 3.80 1.67 A 4-4-2 4-4-2
18 1 1 1 1 156.0 665321 9984 9991 2.88 3.30 2.25 A 4-4-2 4-4-2
19 1 1 1 1 155.0 665322 9994 10000 2.30 3.30 2.88 D 4-4-2 4-4-2
20 1 1 1 1 155.0 665323 8571 8635 5.75 3.60 1.57 A 4-4-2 4-4-2
21 1 1 1 10 219.0 665411 8342 8635 2.50 3.25 2.60 H 4-4-2 4-4-2
22 1 1 1 10 218.0 665417 8203 9994 1.91 3.30 4.20 H 4-4-2 4-4-2
23 1 1 1 10 218.0 665421 9993 10001 1.73 3.50 5.00 H 4-4-2 4-4-2
24 1 1 1 11 233.0 665425 8342 9991 1.70 3.40 4.75 H 4-4-2 4-4-2
25 1 1 1 11 233.0 665427 9987 8203 1.85 3.30 4.00 A 4-4-2 4-4-2
26 1 1 1 11 232.0 665429 10000 9993 1.91 3.30 3.80 H 4-4-2 4-4-2
27 1 1 1 11 232.0 665430 9994 9984 2.10 3.20 3.30 D 4-4-2 4-4-2
28 1 1 1 11 232.0 665435 10001 9985 5.50 3.50 1.60 H 4-4-2 4-4-2
29 1 1 1 12 239.0 665438 9985 10000 1.50 4.00 7.00 D 4-4-2 4-4-2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19570 21484 21484 7 6 2403.0 2030140 9783 8558 2.10 3.30 3.75 H 4-4-2 4-4-2
19571 21484 21484 7 6 2403.0 2030141 9869 8603 2.30 3.20 3.30 A 4-4-2 4-2-3-1
19572 21484 21484 7 7 2409.0 2030142 8302 8634 4.00 3.60 1.91 H 4-2-3-1 4-3-3
19573 21484 21484 7 7 2408.0 2030144 9910 8305 1.44 4.50 7.50 D 4-3-3 4-2-3-1
19574 21484 21484 7 7 2410.0 2030145 8581 10205 4.33 3.40 1.91 H 5-3-2 4-4-2
19575 21484 21484 7 7 2410.0 2030146 9906 8633 3.20 3.30 2.30 D 4-4-2 4-3-3
19576 21484 21484 7 7 2409.0 2030147 9864 8560 2.38 3.10 3.30 H 4-2-3-1 4-2-3-1
19577 21484 21484 7 7 2410.0 2030148 8315 10267 2.30 3.30 3.20 H 4-2-3-1 4-2-3-1
19578 21484 21484 7 7 2409.0 2030149 7878 9783 2.60 3.20 2.80 D 4-2-3-1 4-4-2
19579 21484 21484 7 7 2409.0 2030150 8558 9869 2.00 3.30 4.00 A 4-2-3-1 4-2-3-1
19580 21484 21484 7 7 2410.0 2030151 8370 8603 2.20 3.50 3.20 A 4-2-3-1 4-4-2
19581 21484 21484 7 8 2423.0 2030152 8634 8370 1.14 9.00 15.00 H 4-3-3 4-2-3-1
19582 21484 21484 7 8 2423.0 2030153 8372 8302 3.60 3.40 2.10 D 4-2-3-1 4-2-3-1
19583 21484 21484 7 8 2424.0 2030155 10205 9910 2.00 3.50 3.75 A 4-4-2 4-3-3
19584 21484 21484 7 8 2423.0 2030156 8633 8581 1.08 12.00 23.00 H 4-3-3 5-3-2
19585 21484 21484 7 8 2424.0 2030157 8560 9906 4.00 3.20 2.05 A 4-2-3-1 4-4-2
19586 21484 21484 7 8 2423.0 2030158 10267 9864 1.62 3.75 6.00 H 4-2-3-1 4-2-3-1
19587 21484 21484 7 8 2424.0 2030159 9783 8315 2.70 3.10 2.80 D 4-4-2 4-2-3-1
19588 21484 21484 7 8 2425.0 2030160 9869 7878 2.00 3.20 4.20 D 4-2-3-1 4-1-4-1
19589 21484 21484 7 8 2423.0 2030161 8603 8558 2.00 3.30 4.00 A 4-2-3-1 4-2-3-1
19590 21484 21484 7 9 2431.0 2030162 8634 8372 1.11 10.00 19.00 H 4-3-3 4-2-3-1
19591 21484 21484 7 9 2430.0 2030163 8302 8305 1.44 4.33 8.00 H 4-2-3-1 4-2-3-1
19592 21484 21484 7 9 2431.0 2030164 8306 10205 3.50 3.25 2.20 D 4-2-3-1 4-4-2
19593 21484 21484 7 9 2430.0 2030165 9910 8633 3.80 3.80 1.91 A 4-3-3 4-3-3
19594 21484 21484 7 9 2431.0 2030166 8581 8560 2.63 3.20 2.80 A 5-3-2 4-2-3-1
19595 21484 21484 7 9 2431.0 2030167 9906 10267 1.57 3.80 6.50 H 4-3-3 4-4-2
19596 21484 21484 7 9 2430.0 2030168 9864 9783 2.25 3.25 3.40 H 4-2-3-1 4-4-2
19597 21484 21484 7 9 2432.0 2030169 8315 9869 1.53 4.00 7.00 H 4-2-3-1 4-3-3
19598 21484 21484 7 9 2430.0 2030170 7878 8603 2.30 3.25 3.25 D 4-3-3 4-4-2
19599 21484 21484 7 9 2429.0 2030171 8370 8558 2.20 3.40 3.20 H 4-2-3-1 4-2-3-1

19600 rows × 14 columns