In [167]:
# %matplotlib inline: render matplotlib figures inside the notebook
# pandas: data-analysis library
# matplotlib: plotting library
# matplotlib.pyplot: the part of matplotlib used to define plots
%matplotlib inline
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn import tree
In [168]:
# Read the scoreboard CSV into a pandas DataFrame and preview its first 10 rows.
df = pd.read_csv(filepath_or_buffer='ScoreBoardFinal.csv')
df.head(n=10)
Out[168]:
In [169]:
# Keep only the rows whose 'Home_ShotsT' value is not the string 'XX-XX-'
# (presumably a missing-data marker -- confirm against the CSV). Indexing with
# a boolean Series selects the rows where it is True.
df_tmp = df[df['Home_ShotsT'].ne('XX-XX-')]
df_tmp.head(n=10)
Out[169]:
In [170]:
# Keep the columns from 'Home_Poss' through the last one. `.loc` performs the
# same label-based slice as the deprecated `.ix` indexer did here, and it is
# what the later df_final cell already uses.
df = df_tmp.loc[:, 'Home_Poss':]
# Convert each column to a numeric dtype where possible; with errors='ignore'
# a column that cannot be parsed is returned unchanged.
df = df.apply(lambda x: pd.to_numeric(x, errors = 'ignore'))
names = df.columns
# Pairwise correlation matrix of the columns; displayed as the cell output.
correlations = df.corr()
correlations
Out[170]:
In [171]:
# Heat-map of the correlation matrix with the colour scale fixed to [-1, 1].
fig = plt.figure(figsize=(35,35))
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# Take the tick positions and labels from the matrix that is actually plotted,
# instead of a hard-coded count of 28, so they stay aligned with it if the
# number of columns changes.
labels = correlations.columns
ticks = np.arange(len(labels))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.show()
In [172]:
# Work on an independent copy. df_tmp is a filtered selection of the raw
# frame, so adding columns through a plain alias triggers pandas'
# SettingWithCopyWarning and would also modify df_tmp itself.
df = df_tmp.copy()
# Constant markers for the two sides: +1 for the home team, -1 for the away team.
df['Home_Team_Factor'] = 1
df['Away_Team_Factor'] = -1
df.head()
Out[172]:
In [173]:
# Encode the result of each match in 'Winning_Team':
#   0 -> neither side scored more (the initial value, left untouched),
#   1 -> the home team scored more goals,
#   2 -> the away team scored more goals.
home_goals = df['Home_Team_Goals']
away_goals = df['Away_Team_Goals']
df['Winning_Team'] = 0
df.loc[home_goals > away_goals, 'Winning_Team'] = 1
df.loc[home_goals < away_goals, 'Winning_Team'] = 2
df.head()
Out[173]:
In [174]:
# Column order for the modelling frame. The statistics are listed as
# home/away pairs (home first) and the target column comes last; later cells
# rely on this order when they slice by position.
cols = [
    'MId', 'Home_Team', 'Home_Team_Goals', 'Away_Team_Goals', 'Away_Team',
    'Home_Poss', 'Away_Poss',
    'Home_ShotsT', 'Away_ShotsT',
    'Home_Shots', 'Away_Shots',
    'Home_Touches', 'Away_Touches',
    'Home_Passes', 'Away_Passes',
    'Home_Tackles', 'Away_Tackles',
    'Home_Clearances', 'Away_Clearances',
    'Home_Corners', 'Away_Corners',
    'Home_Offsides', 'Away_Offsides',
    'HTP', 'ATP',
    'HTR', 'ATR',
    'HTAR', 'ATAR',
    'HTMR', 'ATMR',
    'HTDR', 'ATDR',
    'Home_Team_Factor', 'Away_Team_Factor',
    'Winning_Team',
]
df_final = df[cols]
df_final.head()
Out[174]:
In [175]:
# Drop everything to the left of 'Home_Poss' (the match id, team names and
# goal counts) by keeping the label slice from 'Home_Poss' to the last column.
df_final = df_final.loc[:, slice('Home_Poss', None)]
df_final.head()
Out[175]:
In [176]:
# Convert every column to a numeric dtype where possible; with errors='ignore'
# a column that cannot be parsed is returned unchanged.
df_fn = df_final.apply(lambda x : pd.to_numeric(x,errors='ignore'))
# Sanity check on the element type of the first row. Use positional .iloc[0]:
# rows were filtered out earlier without resetting the index, so a row
# labelled 0 is not guaranteed to exist and ['Home_Poss'][0] could raise KeyError.
type(df_fn['Home_Poss'].iloc[0])
Out[176]:
In [177]:
# Convert the DataFrame to its underlying 2-D NumPy array: one array row per
# DataFrame row and one array column per DataFrame column, in the same order.
arr_df = df_fn.values
arr_df.shape
Out[177]:
In [178]:
# Target vector: the last column of arr_df. With the column order used to
# build df_final, that column is 'Winning_Team'.
y_train = arr_df[:, -1]
y_train.shape
Out[178]:
Removing the "Winning_Team" column to prepare the x_train feature values
In [179]:
# Feature matrix: every column of arr_df except the last one, which was taken
# as the target.
x_train = arr_df[:,:-1]
x_train.shape
Out[179]:
In [180]:
# Home-team features: the even-numbered feature columns (home and away
# statistics alternate, home first). Slice from arr_df rather than from
# x_train: x_train is later rebound to the home-minus-away matrix, so slicing
# it would pick the wrong columns if this cell were re-run after that point.
x_htrain = arr_df[:, :-1][:, 0::2]
x_htrain.shape
Out[180]:
In [181]:
# Away-team features: the odd-numbered feature columns (home and away
# statistics alternate, home first). Slice from arr_df rather than from
# x_train: x_train is later rebound to the home-minus-away matrix, so slicing
# it would pick the wrong columns if this cell were re-run after that point.
x_atrain = arr_df[:, :-1][:, 1::2]
x_atrain.shape
Out[181]:
In [182]:
# Spot check: the last away-team column of the first row.
# NOTE(review): with the column order used to build df_final this should be
# Away_Team_Factor, i.e. -1 -- confirm against the cell output.
x_atrain[0,-1]
Out[182]:
In [183]:
# Replace the raw features with element-wise home-minus-away differences, one
# column per home/away pair. This rebinds x_train, so from here on the name no
# longer refers to the raw feature matrix.
# The Team_Factor pair becomes the constant 1 - (-1) = 2 on every row.
x_train = x_htrain - x_atrain
x_train
Out[183]:
In [184]:
# Shape of the difference matrix: same number of rows as before, one column
# per home/away pair.
x_train.shape
Out[184]:
In [185]:
# Estimator used by the training cell. To switch, replace the assignment
# below with one of these alternatives:
#   linear_model.LogisticRegression()
#   svm.SVC()
#   svm.SVC(kernel='poly',degree=2)
#   tree.DecisionTreeRegressor()
#   tree.DecisionTreeClassifier()
#   linear_model.Lasso()
#   GradientBoostingClassifier(n_estimators=100)
#   AdaBoostClassifier(n_estimators=100)
#   RandomForestClassifier(n_estimators=64)
#   linear_model.BayesianRidge()
model = GradientBoostingRegressor(
    max_depth=5,
    n_estimators=100,
)
In [186]:
def showFeatureImportance(my_categories):
    """Draw the model's feature importances as a horizontal bar chart.

    Reads the notebook-level ``model``, which must expose a
    ``feature_importances_`` attribute.

    my_categories: labels for the importances, used as the Series index.

    The importances are divided by the largest one and plotted in ascending
    order. Nothing is returned.
    """
    importances = pd.Series(model.feature_importances_, index=my_categories)
    importances /= importances.max()
    importances.sort_values().plot(kind='barh')
In [187]:
def regressor(preds):
    """Snap continuous predictions onto the labels 0, 1 and 2, in place.

    preds: NumPy array of predictions. It is overwritten and also returned.

    Values below 0.5 become 0, values in [0.5, 1.5) become 1, and values of
    1.5 or more become 2. NaN entries match none of the ranges and are left
    unchanged.
    """
    # Build all three masks from the untouched input; they are mutually
    # exclusive, so the order of the assignments below does not matter.
    is_low = preds < .50
    is_mid = (preds >= .50) & (preds < 1.5)
    is_high = preds >= 1.5
    preds[is_low] = 0
    preds[is_mid] = 1
    preds[is_high] = 2
    return preds
In [188]:
# Labels for the columns of x_train, in column order (one per home/away pair).
categories = ['Poss', 'ShotsT', 'Shots', 'Touches', 'Passes', 'Tackles',
              'Clearances', 'Corners', 'Offsides', 'TP', 'TR', 'TAR', 'TMR',
              'TDR', 'Team_Factor']
# Number of random train/test splits over which the accuracy is averaged.
n_iterations = 1
accuracy = []
for i in range(n_iterations):
    # Seed the split with the iteration number: every iteration gets a
    # different split, but re-running the cell reproduces the same splits.
    X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, random_state=i)
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print(X_test.shape)
    results = model.fit(X_train, Y_train)
    preds = model.predict(X_test)
    print(preds)
    # Map the model's predictions onto the labels 0/1/2 before scoring.
    preds = regressor(preds)
    # Fraction of test rows whose mapped prediction equals the true label.
    accuracy.append(np.mean(preds == Y_test))
    print("Finished iteration: {}".format(i))
    print(preds)
    print(Y_test)
print("The accuracy is {}".format(sum(accuracy)/len(accuracy)))
showFeatureImportance(categories)