Goalkeeper OLS

  • Import Packages
  • Connect DB & Get Goalkeeper Data
  • Scaling
  • OLS Summary
  • Remove Features
  • ANOVA & Remove Features
  • Test Prediction

Import Packages


In [2]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in later scikit-learn releases

Connect DB & Get Goalkeeper Data


In [3]:
db = MySQLdb.connect(
    "db.fastcamp.us",
    "root",
    "dkstncks",
    "football",
    charset='utf8',
)

def make_query(position):
    """
    
    parameter------------
    position : M, D, F, G
    
    return---------------
    SQL_QUERY String
    
    """
    SQL_QUERY = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{position}%"
        ;
    """.format(position=position)
    
    return SQL_QUERY

# goalkeeper
SQL_QUERY = make_query("G")
goalkeeper_df = pd.read_sql(SQL_QUERY, db)

len(goalkeeper_df)


Out[3]:
289
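
The query returns 289 goalkeeper rows. One caveat: building the SQL with str.format splices the value straight into the query string, which is fragile (and unsafe with untrusted input). A minimal alternative sketch, assuming the same player table, that lets the driver bind the parameter instead:

# sketch: parameter binding via the DB driver instead of str.format
SQL_SAFE = "SELECT age, tall, rating FROM player WHERE position LIKE %s;"
safe_df = pd.read_sql(SQL_SAFE, db, params=("%G%",))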
Scaling

In [4]:
X = goalkeeper_df.iloc[:, :-1]
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(goalkeeper_df.iloc[:, -1], columns=["rating"])
g_df = pd.concat([dfX, dfy], axis=1)
g_df.head()


Out[4]:
age tall weight apps_start apps_sub mins goals assists yel red ... blocks owng keyp_x fouled off disp unstch avgp ps_y rating
0 7.432029 15.948156 11.867021 2.703886 0.000000 2.646419 0.0 0.0 0.816273 3.232331 ... 0.0 0.0 1.329329 0.678815 0.0 0.0 0.000000 4.708980 3.971232 6.62
1 5.945623 15.614164 12.139826 0.073078 2.104282 0.141947 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.000000 0.000000 0.0 0.0 0.000000 5.616214 3.558452 5.80
2 4.883905 15.697662 11.321411 2.776964 0.000000 2.789998 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.000000 0.678815 0.0 0.0 1.120947 2.692903 3.316477 6.83
3 7.007342 15.363669 10.912203 2.338496 0.000000 2.347841 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.000000 0.000000 0.0 0.0 0.000000 3.844947 5.999550 6.92
4 5.096249 15.614164 11.594216 0.438468 2.104282 0.442158 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 1.329329 0.000000 0.0 0.0 0.000000 3.110519 5.935498 6.48

5 rows × 30 columns
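
A note on the scaler: with_mean=False divides each column by its standard deviation without centering it, so columns that are all zeros for goalkeepers (goals, blocks, off, ...) stay at zero. If the original units are needed back, the fitted scaler keeps the per-column factors:

# per-column scale factors learned by fit_transform
print(scaler.scale_[:5])

# round-trip back to the original units
X_back = scaler.inverse_transform(X_scaled)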

OLS Summary

In [5]:
model = sm.OLS(g_df.iloc[:, -1], g_df.iloc[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.995
Model:                            OLS   Adj. R-squared:                  0.994
Method:                 Least Squares   F-statistic:                     2027.
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          5.85e-286
Time:                        11:04:54   Log-Likelihood:                -197.13
No. Observations:                 289   AIC:                             444.3
Df Residuals:                     264   BIC:                             535.9
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
age            0.0893      0.031      2.919      0.004         0.029     0.150
tall           0.3709      0.038      9.836      0.000         0.297     0.445
weight        -0.0829      0.051     -1.639      0.102        -0.183     0.017
apps_start    -0.4496      1.674     -0.269      0.788        -3.746     2.847
apps_sub       0.0420      0.042      1.009      0.314        -0.040     0.124
mins           0.4503      1.667      0.270      0.787        -2.833     3.733
goals         -0.0014      0.033     -0.043      0.966        -0.067     0.064
assists       -0.0067      0.031     -0.218      0.828        -0.067     0.054
yel           -0.0371      0.038     -0.980      0.328        -0.112     0.037
red           -0.0309      0.038     -0.813      0.417        -0.106     0.044
spg           -0.0048      0.034     -0.143      0.886        -0.071     0.062
ps_x           0.0620      0.016      3.998      0.000         0.031     0.093
motm           0.0585      0.036      1.626      0.105        -0.012     0.129
aw             0.1233      0.035      3.531      0.000         0.055     0.192
tackles       -0.0597      0.031     -1.924      0.055        -0.121     0.001
inter          0.0159      0.030      0.525      0.600        -0.044     0.075
fouls         -0.0102      0.033     -0.305      0.760        -0.076     0.056
offsides    2.141e-17   1.13e-17      1.889      0.060     -9.08e-19  4.37e-17
clear          0.1396      0.033      4.222      0.000         0.075     0.205
drb           -0.0764      0.030     -2.511      0.013        -0.136    -0.016
blocks        1.6e-17   1.06e-17      1.511      0.132     -4.85e-18  3.68e-17
owng          -0.0290      0.030     -0.969      0.334        -0.088     0.030
keyp_x        -0.0113      0.031     -0.362      0.718        -0.073     0.050
fouled        -0.0272      0.035     -0.784      0.434        -0.096     0.041
off        -8.592e-19   2.12e-18     -0.405      0.686     -5.04e-18  3.32e-18
disp          -0.0310      0.031     -1.012      0.313        -0.091     0.029
unstch        -0.0004      0.031     -0.013      0.990        -0.060     0.060
avgp           0.1137      0.035      3.212      0.001         0.044     0.183
ps_y           0.0620      0.016      3.998      0.000         0.031     0.093
==============================================================================
Omnibus:                      288.295   Durbin-Watson:                   2.005
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            17045.089
Skew:                           3.828   Prob(JB):                         0.00
Kurtosis:                      39.836   Cond. No.                     9.51e+15
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.49e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
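
The huge condition number and the mirrored apps_start/mins coefficients (roughly ±0.45 with standard errors near 1.67) both point at near-duplicate columns. One way to locate them, sketched with statsmodels' variance_inflation_factor on the design matrix built above:

from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF per column of the design matrix; all-zero goalkeeper columns
# come out as NaN and can be ignored
vif = pd.DataFrame({
    "feature": dfX.columns,
    "VIF": [variance_inflation_factor(dfX.values, i) for i in range(dfX.shape[1])],
})
print(vif.sort_values("VIF", ascending=False).head(10))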
Remove Features

In [6]:
# remove features
remove_column_list = [
    "weight", "apps_start", "apps_sub", "mins", "goals", "assists", "yel", "red", "spg", "motm"
    , "tackles","inter", "fouls", "blocks", "owng", "keyp_x", "fouled", "off", "disp", "unstch"
]
removed_g_df = g_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed_g_df.iloc[:, -1], removed_g_df.iloc[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.994
Model:                            OLS   Adj. R-squared:                  0.994
Method:                 Least Squares   F-statistic:                     7263.
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          2.67e-314
Time:                        11:04:55   Log-Likelihood:                -206.16
No. Observations:                 289   AIC:                             426.3
Df Residuals:                     282   BIC:                             452.0
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
age            0.0814      0.029      2.810      0.005         0.024     0.138
tall           0.3138      0.013     23.589      0.000         0.288     0.340
ps_x           0.0688      0.015      4.621      0.000         0.039     0.098
aw             0.1032      0.030      3.404      0.001         0.044     0.163
offsides    1.709e-17    3.8e-18      4.495      0.000      9.61e-18  2.46e-17
clear          0.1396      0.032      4.423      0.000         0.077     0.202
drb           -0.0787      0.030     -2.638      0.009        -0.137    -0.020
avgp           0.0997      0.032      3.125      0.002         0.037     0.163
ps_y           0.0688      0.015      4.621      0.000         0.039     0.098
==============================================================================
Omnibus:                      289.555   Durbin-Watson:                   1.992
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            17990.621
Skew:                           3.835   Prob(JB):                         0.00
Kurtosis:                      40.884   Cond. No.                     3.41e+17
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 8.23e-31. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
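
Even after this first cut, the condition number is still enormous: ps_x and ps_y receive identical coefficients (0.0688), which is the signature of two effectively duplicate columns. A quick check:

# if these are (near-)duplicates, the off-diagonal correlation will be ~1
print(removed_g_df[["ps_x", "ps_y"]].corr())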
ANOVA & Remove Features

In [7]:
formula_str = """
rating ~ age + tall + ps_x + aw + offsides + clear + drb +  avgp + ps_y
"""

model = sm.OLS.from_formula(formula_str, data=removed_g_df)
result = model.fit()
table_anova = sm.stats.anova_lm(result)
table_anova


Out[7]:
df sum_sq mean_sq F PR(>F)
age 1.0 0.000003 0.000003 0.000022 0.996227
tall 1.0 0.234744 0.234744 1.853932 0.174417
ps_x 1.0 0.998707 0.998707 7.887450 0.005327
aw 1.0 1.994044 1.994044 15.748282 0.000092
offsides 1.0 0.036328 0.036328 0.286909 0.592632
clear 1.0 1.943786 1.943786 15.351360 0.000112
drb 1.0 0.776576 0.776576 6.133133 0.013856
avgp 1.0 0.973691 0.973691 7.689884 0.005925
ps_y 1.0 0.001614 0.001614 0.012749 0.910181
Residual 281.0 35.580157 0.126620 NaN NaN
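
Note that anova_lm with no typ argument produces a sequential (Type I) table, so each row is conditional on the terms above it and the result depends on the order in the formula; ps_y looks useless here largely because the near-duplicate ps_x already entered. An order-independent partial (Type II) table is one call away:

table_anova2 = sm.stats.anova_lm(result, typ=2)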

In [8]:
# remove feature 2
remove_column_list = [
    "age", "offsides", "ps_y"
]
removed2_g_df = removed_g_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed2_g_df.iloc[:, -1], removed2_g_df.iloc[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.994
Model:                            OLS   Adj. R-squared:                  0.994
Method:                 Least Squares   F-statistic:                     8271.
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          1.38e-314
Time:                        11:04:58   Log-Likelihood:                -210.15
No. Observations:                 289   AIC:                             432.3
Df Residuals:                     283   BIC:                             454.3
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
tall           0.3411      0.009     37.102      0.000         0.323     0.359
ps_x           0.1426      0.030      4.741      0.000         0.083     0.202
aw             0.1071      0.031      3.495      0.001         0.047     0.167
clear          0.1424      0.032      4.459      0.000         0.080     0.205
drb           -0.0727      0.030     -2.417      0.016        -0.132    -0.013
avgp           0.1075      0.032      3.342      0.001         0.044     0.171
==============================================================================
Omnibus:                      309.888   Durbin-Watson:                   1.937
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            23372.669
Skew:                           4.230   Prob(JB):                         0.00
Kurtosis:                      46.237   Cond. No.                         22.0
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
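
With the condition number down to 22.0, the design matrix is finally well behaved. Before trusting the in-sample R-squared, a quick out-of-sample check with the cross_val_score imported at the top (a sketch, reusing the reduced feature set):

# mean 5-fold cross-validated R^2 for the reduced model
scores = cross_val_score(
    LinearRegression(),
    removed2_g_df.iloc[:, :-1],
    removed2_g_df.iloc[:, -1],
    cv=5,
)
print(scores.mean())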

Test Prediction


In [17]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(removed2_g_df.iloc[:, :-1], removed2_g_df.iloc[:, -1])


Out[17]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [10]:
w = model.coef_[1:]  # drop the first fitted coefficient so w lines up with the row slice below

In [21]:
N = 5
print(removed2_g_df.loc[N])
v = removed2_g_df.loc[N].values[1:-1]  # feature values, skipping the first column and the rating

print(w)
print(v)

# ad-hoc manual prediction: dot product of the remaining weights and
# features, rescaled by 10 (intercept and first feature omitted)
print((w * v).sum() * 10)


tall      14.946178
ps_x       3.800427
aw         1.366757
clear      1.558145
drb        0.818273
avgp       3.902549
rating     6.480000
Name: 5, dtype: float64
[ 0.06039669  0.06172685  0.06533551 -0.06013044  0.06325299]
[ 3.80042672  1.36675663  1.55814536  0.81827344  3.90254893]
6.13345738541
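
The manual dot product lands at 6.13 against an actual rating of 6.48, but it skips the intercept and the first coefficient and leans on the ad-hoc x10 factor. The fitted model's own prediction is a cleaner check:

# full linear prediction for the same row, intercept included
print(model.predict(removed2_g_df.iloc[[N], :-1]))
print(removed2_g_df.iloc[N, -1])  # actual rating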
