In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sqlite3
import numpy as np
from numpy import random
# Path of the SQLite file, relative to the notebook's working directory.
database = 'database.sqlite'
# Connection shared by every pd.read_sql call below.
# NOTE(review): nothing visible in this notebook closes it.
conn = sqlite3.connect(database)
In [2]:
def build_complete_match_query():
    """Build the SQL selecting matches with no missing lineup, score or B365 data.

    A row of ``match`` is kept only when all of the following are non-null:

    * the X and Y pitch coordinates of all 11 home and 11 away players,
    * ``home_team_goal`` and ``away_team_goal``,
    * the 22 ``home_player_N`` / ``away_player_N`` id columns,
    * the ``B365H`` / ``B365D`` / ``B365A`` columns.

    The conditions are generated from the column-name patterns rather than
    typed out.  The hand-written version quoted ``'home_player_Y1'`` and
    ``'home_player_Y2'``; SQLite reads single-quoted text as a string literal,
    so those two checks were always true and never filtered anything.

    Returns:
        str: a single ``select * from match where ...;`` statement.
    """
    required = []
    for side in ('home', 'away'):
        for slot in range(1, 12):
            required.append('%s_player_X%d' % (side, slot))
            required.append('%s_player_Y%d' % (side, slot))
    required.extend(['home_team_goal', 'away_team_goal'])
    for side in ('home', 'away'):
        for slot in range(1, 12):
            required.append('%s_player_%d' % (side, slot))
    required.extend(['B365H', 'B365D', 'B365A'])

    # Bare (unquoted) identifiers so SQLite resolves them as columns.
    conditions = ['%s is not null' % column for column in required]
    return 'select * from match where ' + '\nand '.join(conditions) + ';'


query = build_complete_match_query()
# Run the filter query and load the surviving rows into a DataFrame.
matches = pd.read_sql(query, conn)
# Display the frame as the cell output.
matches
Out[2]:
In [3]:
# Pair every Player row with its Player_Stats rows on player_api_id
# (comma join filtered by the where clause). Note this rebinds `query`.
# NOTE(review): `select *` pulls player_api_id from both tables, so the frame
# presumably carries duplicate column names — confirm, and select explicit
# columns if so.
query = """ select * from Player as p ,Player_Stats as s where p.player_api_id = s.player_api_id;"""
player = pd.read_sql(query, conn)
# Show the player id next to the date of each stat record.
player[['player_api_id','date_stat']]
Out[3]:
In [4]:
# Column names from the 27th-from-last up to, but not including, the last one.
drop = matches.columns.values[-27:-1]
print(drop)
# Removing other betting houses odds. 'BSA' is dropped by name because the
# slice above stops before the final column — presumably 'BSA' is that final
# column; TODO confirm against the table schema.
matches = matches.drop(drop, axis=1).drop('BSA', axis=1)
# Raw features
matches.columns.values
Out[4]:
In [5]:
# Show the remaining column names (same expression as the end of the previous cell).
matches.columns.values
Out[5]:
In [6]:
# Per-match event columns that are not used as features below.
event_columns = ['goal', 'shoton', 'shotoff', 'foulcommit',
                 'card', 'cross', 'corner', 'possession']
matches = matches.drop(event_columns, axis=1)
In [7]:
# Transforming date column into a date type
matches['date'] = pd.to_datetime(matches['date'],
                                 format='%Y-%m-%d %H:%M:%S.%f')
# Re-express each date relative to the first date that appears in the column.
# NOTE(review): that is the first row's date, not necessarily the earliest
# one, so offsets are negative for any row dated before it.
# NOTE(review): this cell overwrites its own input, so re-running it on the
# already converted column will presumably fail — restart the kernel instead.
reference_date = matches['date'].unique()[0]
elapsed = matches['date'] - reference_date
matches['date'] = elapsed.astype('timedelta64[D]')
In [8]:
# The player id columns are intentionally kept here; they are dropped later,
# when matches_without_ids is built.

# Transforming season column into categorical value: each distinct season
# string is replaced by an integer code.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
season_labels = matches['season'].astype('str')
matches['season'] = le.fit_transform(season_labels)
In [9]:
# Summary statistics of the integer-encoded season column.
matches['season'].describe()
Out[9]:
In [10]:
# Column names after the drops and conversions above.
matches.columns.values
Out[10]:
In [ ]:
In [11]:
# GraphLab Create supplies the SFrame, k-fold helper and logistic classifier used below.
import graphlab as gl
# Send GraphLab Canvas output (e.g. the later data.show()) to the 'ipynb' target.
gl.canvas.set_target('ipynb')
In [12]:
#train_data,test_data = matches_without_ids.random_split(.9, seed=0)
#matches['H']=(matches['home_team_goal']>matches['away_team_goal']).astype(int)
#matches['A']=(matches['home_team_goal']<matches['away_team_goal']).astype(int)
def determine_home_result(match):
    """Classify a match from its two score columns.

    Args:
        match: mapping-like row exposing 'home_team_goal' and
            'away_team_goal'.

    Returns:
        'H' when the home side scored more, 'A' when the away side scored
        more, and 'D' otherwise.
    """
    home_goals = match['home_team_goal']
    away_goals = match['away_team_goal']
    if home_goals > away_goals:
        return 'H'
    if home_goals < away_goals:
        return 'A'
    return 'D'
# Label every match by applying determine_home_result to each row (axis=1);
# 'Output' is the target column of the classifiers trained below.
matches['Output']=matches.apply(determine_home_result, axis=1)
In [13]:
# The 22 player id columns: home_player_1..11 followed by away_player_1..11.
player_id_columns = ['%s_player_%d' % (side, slot)
                     for side in ('home', 'away')
                     for slot in range(1, 12)]
# Also drop the row id and the two score columns; the scores are what the
# 'Output' label was derived from.
matches_without_ids = matches.drop(
    player_id_columns + ['id', 'home_team_goal', 'away_team_goal'],
    axis=1)
In [14]:
# Columns that remain available to the models.
matches_without_ids.columns.values
Out[14]:
In [15]:
# Convert the pandas frame to a GraphLab SFrame.
pure_data = gl.SFrame(matches_without_ids)
# Random split with fraction .8 and a fixed seed; the first part is used for
# training, the second is held out for evaluation.
train_data,test_data = pure_data.random_split(.8, seed=0)
#train,valid=train_data.random_split(.8,seed=0)
# K-fold helper over the training part with 5 folds; iterated below as
# (train, valid) pairs.
folds = gl.cross_validation.KFold(train_data, 5)
In [16]:
# Feature columns, in the same order as the original hand-written list:
# match metadata, then every player's coordinates (all X columns before all
# Y columns, home before away within each axis), then the B365 odds columns.
kfold_features = ['country_id', 'league_id', 'season', 'stage', 'date',
                  'match_api_id', 'home_team_api_id', 'away_team_api_id']
for axis in ('X', 'Y'):
    for side in ('home', 'away'):
        for slot in range(1, 12):
            kfold_features.append('%s_player_%s%d' % (side, axis, slot))
kfold_features.extend(['B365H', 'B365D', 'B365A'])

# Train one logistic classifier per fold, validating on that fold's held-out part.
# `model` keeps pointing at the last fold's classifier after the loop; later
# cells rely on that.
model_kfolds = []
for train, valid in folds:
    model = gl.logistic_classifier.create(train,
                                          target='Output',
                                          features=kfold_features,
                                          validation_set=valid)
    model_kfolds.append(model)
In [17]:
# Evaluate every fold's classifier on the same held-out test split.
result = [fold_model.evaluate(test_data) for fold_model in model_kfolds]
In [18]:
#pure_model.show(view='Evaluation')
# Display the list of per-fold evaluation results.
result
Out[18]:
In [19]:
# `model` here is whatever the k-fold loop bound last (the final fold's classifier).
# Requests k = 3 classes per test row with output_type='probability'.
top = model.predict_topk(test_data, output_type='probability', k = 3)
In [20]:
# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print(top)
In [21]:
# Predictions of the last k-fold model for each test row (compared with 'Output' in the next cell).
pred = model.predict(test_data)
In [22]:
# Accuracy: share of test rows whose prediction equals the true 'Output'.
# Dividing by a float keeps Python 2 from truncating the ratio.
correct = (test_data['Output'] == pred).sum()
correct / float(len(pred))
Out[22]:
In [ ]:
In [23]:
import re  # no longer used by def_formations; kept in case later cells rely on it


def def_formations(matches_positions):
    """Derive a formation string such as '4-4-2' for every row of Y positions.

    Each value in a row is bucketed into one of four lines by its Y
    coordinate: ``<= 3`` (defence), ``4..6`` (first midfield line), ``7..9``
    (second midfield line) and ``>= 10`` (attack).  The sizes of the
    non-empty lines are joined with '-'.

    The previous implementation formatted all four counts and then ran
    ``re.sub('0-', '', ...)``.  That also matched the "0-" inside "10-"
    ("10-2-0-0" became "12-0") and never removed a trailing empty line
    ("4-3-3-0").  Empty lines are now skipped before joining, so neither
    case can occur.  A row with no countable value still yields '0', as
    before.

    Args:
        matches_positions: an SFrame (anything exposing ``to_dataframe()``)
            or a pandas DataFrame with one column per player and numeric Y
            coordinates.  Missing values fall in no bucket.

    Returns:
        list of str: one formation per row, in row order.
    """
    if hasattr(matches_positions, 'to_dataframe'):
        pos = matches_positions.to_dataframe()
    else:
        pos = matches_positions

    # Count the players in each line for all rows at once instead of iterating rows.
    line_sizes = zip(
        (pos <= 3).sum(axis=1),
        ((pos >= 4) & (pos <= 6)).sum(axis=1),
        ((pos >= 7) & (pos <= 9)).sum(axis=1),
        (pos >= 10).sum(axis=1),
    )

    form = []
    for sizes in line_sizes:
        occupied = [str(int(size)) for size in sizes if size > 0]
        form.append('-'.join(occupied) if occupied else '0')
    return form
In [24]:
# Y coordinates of players 2..11 on each side; player 1's column is left out
# of the formation count.
positions_home = pure_data[['home_player_Y%d' % slot for slot in range(2, 12)]]
positions_away = pure_data[['away_player_Y%d' % slot for slot in range(2, 12)]]

# One formation string per match and side.
formation_home = def_formations(positions_home)
formation_away = def_formations(positions_away)
In [25]:
# Attach the formation strings to the SFrame as two new columns (mutates pure_data in place).
pure_data['formation_h']=formation_home
pure_data['formation_a']=formation_away
# Distinct away formations found in the data.
pure_data['formation_a'].unique()
Out[25]:
In [26]:
# Binds `data` to the same object as pure_data (no copy is made here).
# NOTE(review): the next cell rebinds `data` right away, so this looks redundant.
data= pure_data
In [27]:
# All 44 coordinate columns: X and Y for players 1..11 on both sides. The
# formation columns take their place as features in the next model.
coordinate_columns = ['%s_player_%s%d' % (side, axis, slot)
                      for axis in ('X', 'Y')
                      for side in ('home', 'away')
                      for slot in range(1, 12)]
data = pure_data.to_dataframe().drop(coordinate_columns, axis=1)
In [28]:
# Convert back to an SFrame; `data` was a pandas DataFrame up to this point.
data = gl.SFrame(data)
# Open the frame in GraphLab Canvas.
data.show()
In [29]:
# Split again, this time on the formation-based frame. This rebinds
# train_data / test_data / model from the k-fold experiment above.
train_data, test_data = data.random_split(.8, seed=0)
train, valid = train_data.random_split(.8, seed=0)

# Match metadata and B365 odds columns plus the two derived formation
# columns; no raw coordinates.
formation_features = ['country_id', 'league_id', 'season', 'stage', 'date',
                      'match_api_id', 'home_team_api_id', 'away_team_api_id',
                      'B365H', 'B365D', 'B365A',
                      'formation_h', 'formation_a']
model = gl.logistic_classifier.create(train,
                                      target='Output',
                                      features=formation_features,
                                      validation_set=valid)
In [30]:
# Evaluate the formation-based classifier on the held-out split.
model.evaluate(test_data)
Out[30]:
In [31]:
# Accuracy of the formation-based model: share of test rows whose prediction
# equals the true 'Output'. The float divisor avoids Python 2 integer division.
pred = model.predict(test_data)
correct = (test_data['Output'] == pred).sum()
correct / float(len(pred))
Out[31]:
In [40]:
def ExtractGoalTendency(values):
    """Return ``values`` ordered by league, season and stage.

    ``DataFrame.sort_values`` returns a new frame and does not sort in place.
    The previous version discarded that result and returned the input
    unsorted; the sorted frame is now returned.

    NOTE(review): despite the name, nothing goal-related is computed here —
    presumably the sort is a first step of unfinished work.

    Args:
        values: pandas DataFrame with 'league_id', 'season' and 'stage'
            columns.

    Returns:
        A new DataFrame with the same rows and original index labels, sorted
        by those three columns. The input frame is left untouched.
    """
    #data = values.to_dataframe()
    return values.sort_values(by=['league_id', 'season', 'stage'])
# Convert the SFrame to pandas and pass it through ExtractGoalTendency.
data_goals = ExtractGoalTendency(data.to_dataframe())
# Display the returned frame.
data_goals
Out[40]: