In [26]:
# Import libraries
# SQL
import sqlite3
# Data Manipulation
import numpy as np
import pandas as pd
# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno  # visualize NaNs
# Nice tables
from ipy_table import *
# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
%matplotlib inline
In [29]:
# Load data (make sure you have downloaded database.sqlite)
with sqlite3.connect(r'C:/Python/Soccer1/database.sqlite') as con:
    country_df = pd.read_sql_query("SELECT * FROM Country", con)
    matches_df = pd.read_sql_query("SELECT * FROM Match", con)
    league_df = pd.read_sql_query("SELECT * FROM League", con)
    team_df = pd.read_sql_query("SELECT * FROM Team", con)
    player_df = pd.read_sql_query("SELECT * FROM Player", con)
    player_stats_df = pd.read_sql_query("SELECT * FROM Player_Attributes", con)
# Load past results
super_table = [['Name', 'Regression function', 'Train Acc', 'Validation Acc',
                'r2_score', 'conf_matrix', 'Cross-Valid.']]
df_load = pd.read_csv(r'C:/Python/Soccer/out/results_table.csv')
for row in df_load.values.tolist():
    super_table.append(row)
In [3]:
player_vector = ['home_player_1', 'home_player_2',
'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10',
'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3',
'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7',
'away_player_8', 'away_player_9', 'away_player_10',
'away_player_11']
# Attach each player's overall rating to the match rows.
# Player_Attributes has many dated rows per player, so collapse them to one
# value per player first (a plain merge would mis-align the row order).
rating_by_player = player_stats_df.groupby('player_api_id')['overall_rating'].mean()
for player in player_vector:
    matches_df[player + '_overall_rating'] = matches_df[player].map(rating_by_player)
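The averaged rating above ignores when each attribute snapshot was taken. A time-aware lookup (a sketch only, assuming the usual Kaggle European Soccer schema with id and date columns on Match) would instead pick the snapshot in force on the match date via pd.merge_asof, shown here for a single player column:
In [ ]:
# Sketch: latest attribute snapshot on or before each match date
ratings = player_stats_df[['player_api_id', 'date', 'overall_rating']].copy()
ratings['date'] = pd.to_datetime(ratings['date'])
left = matches_df[['id', 'date', 'home_player_1']].dropna(subset=['home_player_1']).copy()
left['date'] = pd.to_datetime(left['date'])
left['home_player_1'] = left['home_player_1'].astype('int64')  # merge_asof needs matching 'by' dtypes
asof = pd.merge_asof(left.sort_values('date'), ratings.sort_values('date'),
                     on='date', left_by='home_player_1', right_by='player_api_id',
                     direction='backward')  # 'backward' = most recent snapshot not after the match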
In [4]:
all_df = [country_df, matches_df, league_df, team_df, player_df, player_stats_df]
# Tag each frame with a display name for the info loop below
# (pandas warns about setting non-column attributes; warnings are silenced above)
country_df.dfname = 'country_df'
matches_df.dfname = 'matches_df'
league_df.dfname = 'league_df'
team_df.dfname = 'team_df'
player_df.dfname = 'player_df'
player_stats_df.dfname = 'player_stats_df'
In [6]:
for df in all_df:
    print(df.dfname)
    df.info()  # info() prints directly and returns None
    print('-' * 40)
In [5]:
msno.matrix(matches_df)
In [6]:
matches_df.dropna(thresh=80, inplace=True)  # keep rows with at least 80 non-null values
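For reference, thresh counts non-null values per row; a toy sketch of the same call:
In [ ]:
# Row 1 has only one non-null value, so thresh=2 drops it while row 0 survives
toy = pd.DataFrame({'a': [1, np.nan], 'b': [2, np.nan], 'c': [3, 4]})
print(toy.dropna(thresh=2))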
In [7]:
msno.matrix(matches_df)
In [9]:
msno.dendrogram(matches_df)
In [11]:
matches_df.hist(figsize = (100, 100))
plt.show()
In [5]:
bookies = ['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']
bookies_H = [bookie+'H' for bookie in bookies]
bookies_A = [bookie+'A' for bookie in bookies]
bookies_D = [bookie+'D' for bookie in bookies]
bookies_types = {'Home odds':bookies_H, 'Draw odds':bookies_D, 'Away odds':bookies_A}
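These decimal odds encode implied probabilities (p = 1/odds). Summing a match's three implied probabilities gives the bookmaker's overround, i.e. its built-in margin, which is why the sum exceeds 1. A quick check for Bet365 (a sketch using the columns loaded above):
In [ ]:
# Implied probabilities sum to more than 1; the excess is the bookie's margin
implied = 1.0 / matches_df[['B365H', 'B365D', 'B365A']].dropna()
overround = implied.sum(axis=1)
print(overround.describe())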
In [13]:
for home, draw, away in zip(bookies_H, bookies_D, bookies_A):
    fig, ax = plt.subplots()
    for odds in [home, draw, away]:
        sns.distplot(matches_df[odds].dropna(), ax=ax, label=odds, hist=False)
    # Title = bookie name (strip the H/D/A suffix)
    plt.title(home[:-1], fontsize=16)
    # Remove x label and clip the long right tail
    ax.set_xlabel('')
    ax.set_xlim([0, 8])
    plt.show()
In [14]:
plt.rc('legend', fontsize=30)
# _______ All bookmakers - Home/Draw/Away odds | KDE + boxplots
bookies_types = {'Home odds': bookies_H, 'Draw odds': bookies_D, 'Away odds': bookies_A}
for bookie_type, bookie_list in bookies_types.items():
    fig, axes = plt.subplots(ncols=2, figsize=(30, 15))
    axes[0].set_xlim([0, 8])
    if bookie_type == 'Home odds':
        axes[0].set_ylim([0, 0.65])
    elif bookie_type == 'Draw odds':
        axes[0].set_ylim([0, 2.3])
    else:
        axes[0].set_ylim([0, 0.35])
    for bookie in bookie_list:
        sns.distplot(matches_df[bookie].dropna(), ax=axes[0], label=bookie, hist=False)
    # Remove x label and place the legend
    axes[0].set_xlabel('')
    axes[0].legend(loc='best')
    bookie_sel_df = matches_df[bookie_list]
    # ax=axes[1] pins the boxplots to the second panel
    sns.boxplot(data=bookie_sel_df, palette='Set2', showmeans=True, ax=axes[1])
    if bookie_type == 'Home odds':
        axes[1].set_ylim([1, 5])
    elif bookie_type == 'Draw odds':
        axes[1].set_ylim([1, 10])
    else:
        axes[1].set_ylim([1, 5.5])
    plt.suptitle(str(bookie_type), fontsize=80)
    plt.show()
In [15]:
# How many goals - Home/Away
goals_df = matches_df[['home_team_goal', 'away_team_goal']]
color = ['red', 'lime']
fig, ax = plt.subplots()
ax.set_xlim([0, 10])
ax.set_ylim([0, 9500])
sns.distplot(goals_df.dropna(), ax=ax, kde=False, color=color)
plt.show()
# Contingency table of home vs. away goals
goals_home_vs_away = pd.crosstab(index=matches_df['home_team_goal'],
                                 columns=matches_df['away_team_goal'])
print(goals_home_vs_away)
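A natural follow-up is a chi-square test of independence on this contingency table (a sketch; assumes scipy is available, and note the sparse high-score cells weaken the approximation):
In [ ]:
from scipy.stats import chi2_contingency
chi2_stat, p_value, dof, expected = chi2_contingency(goals_home_vs_away)
print('chi2 = {:.1f}, p = {:.3g}, dof = {}'.format(chi2_stat, p_value, dof))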
In [6]:
# Preprocessing libraries (note: Imputer was removed in scikit-learn 0.22+,
# where SimpleImputer replaces it)
from sklearn.preprocessing import LabelEncoder, StandardScaler, Imputer, OneHotEncoder
In [18]:
def impute_numeric(dataset, formula):
    """
    Impute numeric values in a dataset using linear regression.
    dataset: pandas DataFrame
    formula: patsy-style string, e.g. 'Y ~ X1 + X2'
    """
    import statsmodels.formula.api as smf
    import pandas as pd
    # Fit OLS on the complete rows (statsmodels drops NaN rows by default)
    lm = smf.ols(formula=formula, data=dataset)
    res = lm.fit()
    # Predict the target for rows that contain NaNs
    target_col = formula.split(None, 1)[0]
    temp_train = dataset[pd.isnull(dataset).any(axis=1)].copy()
    temp_train = temp_train.drop(target_col, axis=1).copy()
    var_pred = res.predict(temp_train).round(decimals=0)
    # fillna aligns on index, so only the originally missing cells are filled
    dataset[target_col].fillna(var_pred, inplace=True)
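A hypothetical call (the formula is illustrative; any mostly-complete odds columns would do):
In [ ]:
# Fill gaps in SJH by regressing it on two well-covered bookmakers
impute_numeric(matches_df, 'SJH ~ B365H + BWH')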
In [7]:
def label_win(row):
    if row['home_team_goal'] > row['away_team_goal']:
        return 'WIN'
    if row['home_team_goal'] == row['away_team_goal']:
        return 'DRAW'
    return 'LOSE'
matches_df['RESULT'] = matches_df.apply(label_win, axis=1)
In [8]:
ml_matches_df = matches_df[[ 'country_id', 'league_id', 'season', 'stage', 'date',
'home_team_api_id', 'away_team_api_id', 'home_player_X1',
'home_player_X2', 'home_player_X3', 'home_player_X4',
'home_player_X5', 'home_player_X6', 'home_player_X7',
'home_player_X8', 'home_player_X9', 'home_player_X10',
'home_player_X11', 'away_player_X1', 'away_player_X2',
'away_player_X3', 'away_player_X4', 'away_player_X5',
'away_player_X6', 'away_player_X7', 'away_player_X8',
'away_player_X9', 'away_player_X10', 'away_player_X11',
'home_player_Y1', 'home_player_Y2', 'home_player_Y3',
'home_player_Y4', 'home_player_Y5', 'home_player_Y6',
'home_player_Y7', 'home_player_Y8', 'home_player_Y9',
'home_player_Y10', 'home_player_Y11', 'away_player_Y1',
'away_player_Y2', 'away_player_Y3', 'away_player_Y4',
'away_player_Y5', 'away_player_Y6', 'away_player_Y7',
'away_player_Y8', 'away_player_Y9', 'away_player_Y10',
'away_player_Y11', 'home_player_1', 'home_player_2',
'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10',
'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3',
'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7',
'away_player_8', 'away_player_9', 'away_player_10',
'away_player_11', 'B365H', 'B365D', 'B365A', 'BWH',
'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'PSH',
'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'SJH', 'SJD', 'SJA', 'VCH',
'VCD', 'VCA', 'GBH', 'GBD', 'GBA', 'BSH', 'BSD', 'BSA', 'home_player_1_overall_rating', 'home_player_2_overall_rating',
'home_player_3_overall_rating', 'home_player_4_overall_rating',
'home_player_5_overall_rating', 'home_player_6_overall_rating',
'home_player_7_overall_rating', 'home_player_8_overall_rating',
'home_player_9_overall_rating', 'home_player_10_overall_rating',
'home_player_11_overall_rating', 'away_player_1_overall_rating',
'away_player_2_overall_rating', 'away_player_3_overall_rating',
'away_player_4_overall_rating', 'away_player_5_overall_rating',
'away_player_6_overall_rating', 'away_player_7_overall_rating',
'away_player_8_overall_rating', 'away_player_9_overall_rating',
'away_player_10_overall_rating', 'away_player_11_overall_rating', 'RESULT']].copy()  # copy so later in-place ops don't hit a view of matches_df
In [9]:
# Drop rows with NaNs in these columns (each has fewer than 300 missing records, so little data is lost)
ml_matches_df.dropna(subset = ['B365H', 'BWH', 'IWH', 'WHH', 'VCH', 'LBH', 'home_team_api_id', 'away_team_api_id',
'home_player_X1', 'home_player_X2', 'home_player_X3', 'home_player_X4',
'home_player_X5', 'home_player_X6', 'home_player_X7', 'home_player_X8',
'home_player_X9', 'home_player_X10', 'home_player_X11', 'away_player_X1',
'away_player_X2', 'away_player_X3', 'away_player_X4', 'away_player_X5',
'away_player_X6', 'away_player_X7', 'away_player_X8', 'away_player_X9',
'away_player_X10', 'away_player_X11', 'home_player_Y1', 'home_player_Y2',
'home_player_Y3', 'home_player_Y4', 'home_player_Y5', 'home_player_Y6',
'home_player_Y7', 'home_player_Y8', 'home_player_Y9', 'home_player_Y10',
'home_player_Y11', 'away_player_Y1', 'away_player_Y2', 'away_player_Y3',
'away_player_Y4', 'away_player_Y5', 'away_player_Y6', 'away_player_Y7',
'away_player_Y8', 'away_player_Y9', 'away_player_Y10', 'away_player_Y11',
'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10',
'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3',
'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7',
'away_player_8', 'away_player_9', 'away_player_10','away_player_11',
'home_player_1', 'home_player_2', 'home_player_1_overall_rating', 'home_player_2_overall_rating',
'home_player_3_overall_rating', 'home_player_4_overall_rating',
'home_player_5_overall_rating', 'home_player_6_overall_rating',
'home_player_7_overall_rating', 'home_player_8_overall_rating',
'home_player_9_overall_rating', 'home_player_10_overall_rating',
'home_player_11_overall_rating', 'away_player_1_overall_rating',
'away_player_2_overall_rating', 'away_player_3_overall_rating',
'away_player_4_overall_rating', 'away_player_5_overall_rating',
'away_player_6_overall_rating', 'away_player_7_overall_rating',
'away_player_8_overall_rating', 'away_player_9_overall_rating',
'away_player_10_overall_rating', 'away_player_11_overall_rating'
], how='any', inplace=True)
In [13]:
ml_matches_df.info()
In [10]:
# Impute odds for less-covered bookies with the row-wise mean of the others
bookies_types = {'Home odds': bookies_H, 'Draw odds': bookies_D, 'Away odds': bookies_A}
for bookie_type, bookie_list in bookies_types.items():
    bookie_sel_df = ml_matches_df[bookie_list]
    # axis=1: fill each match's missing odds with the mean across bookmakers
    bookie_sel_df = bookie_sel_df.apply(lambda x: x.fillna(x.mean()), axis=1)
    ml_matches_df[bookie_list] = bookie_sel_df.copy()
In [11]:
msno.matrix(ml_matches_df)
In [13]:
matches_data = ml_matches_df.drop('RESULT', axis=1)
matches_target = ml_matches_df[['RESULT']]
In [14]:
ml_matches_df.shape
Out[14]:
In [15]:
matches_data.shape
Out[15]:
In [16]:
matches_target.shape
Out[16]:
In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(matches_data, np.ravel(matches_target), test_size = 0.20, random_state = 101)
In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
clf = clf.fit(X_train, y_train)
Feature importances from the random forest
In [32]:
features_tree = pd.DataFrame()
features_tree['feature'] = matches_data.columns
features_tree['importance'] = clf.feature_importances_
features_tree.sort_values(by=['importance'], ascending=True, inplace=True)
features_tree.set_index('feature', inplace=True)
Feature importances from the chi-square test
In [18]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
features_chi = pd.DataFrame()
features_chi['feature'] = matches_data.columns
# Keep the selector defined: cell In [22] below calls model.transform().
# Note: the label-encoding cell (In [12]) must have run first, since chi2
# cannot score string-typed columns.
model = SelectKBest(chi2, k=125)
fit = model.fit(X_train, y_train)
features_chi2 = chi2(X_train, y_train)
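chi2 also assumes non-negative feature values. A more defensive variant (a sketch only) scales features to [0, 1] before scoring:
In [ ]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
# Scale to [0, 1] so chi2's non-negativity assumption holds, then keep the k best
selector = Pipeline([('scale', MinMaxScaler()),
                     ('kbest', SelectKBest(chi2, k=125))])
X_train_k = selector.fit_transform(X_train, y_train)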
In [19]:
features_chi['importance'] = features_chi2[0]  # chi2 returns (scores, p-values); keep the scores
In [20]:
features_chi.sort_values(by=['importance'], ascending=True, inplace=True)
features_chi.set_index('feature', inplace=True)
In [25]:
features_chi.plot(kind='barh', figsize=(20, 20))
Out[25]:
In [34]:
features_tree.plot(kind='barh', figsize=(20, 20))
Out[34]:
Reducing the dataset to only the relevant features
In [22]:
X_train_reduced = model.transform(X_train)
In [82]:
X_train_reduced.shape
Out[82]:
In [83]:
X_test_reduced = model.transform(X_test)
In [84]:
X_test_reduced.shape
Out[84]:
In [85]:
from sklearn.linear_model import LogisticRegression
In [86]:
param_grid = {'C': [0.1, 1, 10, 100, 1000]}
In [87]:
from sklearn.model_selection import GridSearchCV
In [88]:
grid = GridSearchCV(LogisticRegression(), param_grid, refit=True, verbose = 3)
In [89]:
grid.fit(X_train_reduced, y_train)
Out[89]:
In [90]:
grid.best_params_
Out[90]:
In [91]:
grid.best_estimator_
Out[91]:
In [92]:
grid_predictions = grid.predict(X_test_reduced)
In [95]:
from sklearn.metrics import classification_report, confusion_matrix
In [96]:
print(confusion_matrix(y_test, grid_predictions))
In [97]:
print(classification_report(y_test, grid_predictions))
In [12]:
# Label-encode categorical (object-typed) columns
le = LabelEncoder()
for col in ml_matches_df.columns.values:
    # Encode only categorical variables
    if ml_matches_df[col].dtypes == 'object':
        # Fit on the full column so every level is covered
        le.fit(ml_matches_df[col].values)
        ml_matches_df[col] = le.transform(ml_matches_df[col])
ml_matches_df['RESULT'] = ml_matches_df['RESULT'].astype(float)
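LabelEncoder imposes an arbitrary numeric order on nominal values; where that matters (season, team or league ids), one-hot encoding is the usual alternative. A minimal sketch on the raw column, before label encoding:
In [ ]:
# One indicator column per season, no artificial ordering
season_dummies = pd.get_dummies(matches_df['season'], prefix='season')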
In [27]:
# Scale features
scaled_features = StandardScaler(with_mean=True, with_std=True).fit_transform(ml_matches_df.values)
scaled_features_df = pd.DataFrame(scaled_features, index=ml_matches_df.index, columns=ml_matches_df.columns)
def remap_match_result(x):
    # Standardizing spreads the three encoded classes apart; for this class
    # balance, thresholds at +/-0.5 separate them cleanly
    if x < -0.5:
        return -1
    elif x < 0.5:
        return 0
    else:
        return 1
scaled_features_df['RESULT'] = scaled_features_df['RESULT'].map(remap_match_result)
scaled_features_df.head()
Out[27]:
In [22]:
# machine learning models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
In [23]:
# Split data into train and test sets
def split_data(data, targ):
    # Set target for training
    target = data[targ]
    from sklearn.model_selection import train_test_split
    # Split into train (3/4 of the data) and test (1/4 of the data)
    return train_test_split(data, target, train_size=0.75, random_state=0)
In [24]:
def regr_equation(logreg, train, target):
    # Build a printable 'y = b0 + b1*x1 + ...' string from a fitted model
    if type(logreg) is LogisticRegression:
        coef = logreg.coef_[0]
        intercept = "{:.2f}".format(logreg.intercept_[0])
    else:
        coef = logreg.coef_
        intercept = "{:.2f}".format(logreg.intercept_)
    output = target.name + ' = ' + str(intercept) + ' + '
    coeff_df = pd.DataFrame(train.columns)
    coeff_df.columns = ['Feature']
    coeff_df['Correlation'] = pd.Series(coef)
    features = coeff_df['Feature'].tolist()
    coefficients = coeff_df['Correlation'].tolist()
    for coeff, feature in zip(coefficients, features):
        coeff_str = "{:.2f}".format(coeff)
        output += coeff_str + '*' + str(feature) + ' + '
    return output[:-3]
In [25]:
def confusion_matrix(model, X, y):
    # Local import: inside this helper, confusion_matrix is sklearn's function
    from sklearn.metrics import confusion_matrix
    return confusion_matrix(y, model.predict(X))
In [26]:
def test_model(model, data, target, submission_name=None, test=None):
    from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
    if test is None:
        train, test, target_train, target_test = split_data(data, target)
    train = train.drop('RESULT', axis=1)
    test = test.drop('RESULT', axis=1)
    model.fit(train, target_train)
    # Calculate parameters
    if type(model) is LogisticRegression or type(model) is LinearRegression:
        function_str = regr_equation(model, train, target_train)
    else:
        function_str = 'NA'
    if type(model) is not LinearRegression:
        trainset_acc = round(accuracy_score(target_train, model.predict(train)) * 100, 2)
        testset_acc = round(accuracy_score(target_test, model.predict(test)) * 100, 2)
        conf_matrix = confusion_matrix(target_train, model.predict(train))
        # NB: RESULT has three classes, so the matrix is 3x3; this string
        # reports only its top-left 2x2 block
        conf_matrix = 'TN: ' + str(conf_matrix[0][0]) + ', FP: ' + str(conf_matrix[0][1]) + \
                      ', FN: ' + str(conf_matrix[1][0]) + ', TP: ' + str(conf_matrix[1][1])
    else:
        trainset_acc = 'NA'
        testset_acc = 'NA'
        conf_matrix = 'NA'
    r2 = r2_score(target_train, model.predict(train))
    # Perform k-fold cross-validation with 5 folds
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=5)
    error = []
    predictors = list(data.columns.values)
    del predictors[-1]  # drop RESULT (the last column)
    for train_idx, test_idx in kf.split(data):
        # Filter training data for this fold
        train_predictors = data[predictors].iloc[train_idx, :]
        # The target we're using to train the algorithm
        train_target = data['RESULT'].iloc[train_idx]
        # Train on the fold and record its held-out score
        model.fit(train_predictors, train_target)
        error.append(model.score(data[predictors].iloc[test_idx, :], data['RESULT'].iloc[test_idx]))
    # Fit the model again on the full training split so it can be referred to later
    model.fit(train, target_train)
    # Print summary
    print('-' * 40)
    print('Submission name:', submission_name)
    print('Regression function:\n', function_str)
    print('Accuracy on train set:', trainset_acc, '%')
    print('Accuracy on test set:', testset_acc, '%')
    print('Cross-Validation Score : %s' % '{0:.3%}'.format(np.mean(error)))
    print('R2 score:', r2)
    print('Confusion matrix:\n', conf_matrix)
    return [submission_name, function_str, str(trainset_acc), str(testset_acc), r2, conf_matrix, np.mean(error)]
In [28]:
#Setup
suffix = '_droppedMeaningless_noScaler_addedParams_playerOverallRating'
modelMLP = MLPClassifier(hidden_layer_sizes=(104, 104, 104), activation='logistic', solver='sgd', alpha=0.0001,
batch_size='auto', learning_rate='adaptive', learning_rate_init=1, power_t=0.5,
max_iter=2000, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False,
momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1,
beta_1=0.9, beta_2=0.999, epsilon=1e-08)
modelLogReg = LogisticRegression(n_jobs = -1)
modelLinReg = LinearRegression()
modelSVC = SVC()
modellinSVC = LinearSVC()
modelKN = KNeighborsClassifier(n_neighbors = 3)
modelGNB = GaussianNB()
modelPercp = Perceptron()
modelSGD = SGDClassifier()
# (min_impurity_split and presort below are from older scikit-learn;
# newer versions use min_impurity_decrease and drop presort)
modelTree = DecisionTreeClassifier(criterion='gini', splitter='random', max_depth=None,
                                   min_samples_split=16, min_samples_leaf=8, min_weight_fraction_leaf=0.0,
                                   max_features='sqrt', random_state=None, max_leaf_nodes=None,
                                   min_impurity_split=1e-07, class_weight=None, presort=False)
modelRndForest = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=None, min_samples_split=2,
min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt',
max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True, oob_score=False,
n_jobs=-1, random_state=None, verbose=0, warm_start=False, class_weight=None)
modelXTree = ExtraTreesClassifier(n_estimators=500, criterion='gini', max_depth=None, min_samples_split=16,
min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features='sqrt',
max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=False, oob_score=False,
n_jobs=-1, random_state=None, verbose=0, warm_start=False, class_weight=None)
models = {
    modelRndForest: 'RandomForest',
    modelLogReg: 'logreg',
    modelXTree: 'xTrees'
}
In [30]:
outputs = []
for model, prefix in models.items():
    name = prefix + suffix
    outputs.append(test_model(model, ml_matches_df, target='RESULT', submission_name=name))
outputs.sort(key=lambda x: x[0])
In [31]:
for out in outputs:
    super_table.append(out)
In [30]:
make_table(super_table)
apply_theme('basic')
Out[30]:
In [35]:
df = pd.DataFrame(super_table[1:], columns=super_table[0])
df.to_csv(r'C:/Python/Soccer/out/results_table.csv',
index=False)