Goalkeepr OLS

Import Package
Connect DB & get Forword Player Data
Scaling
Summary OLS
Remove Feature
Anova & Remove Feature

Import Package



In [2]:

    
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

Connect DB & get Forword Player Data



In [3]:

    
db = MySQLdb.connect(
    "db.fastcamp.us",
    "root",
    "dkstncks",
    "football",
    charset='utf8',
)

def make_query(position):
    """
    
    parameter------------
    position : M, D, F, G
    
    return---------------
    SQL_QUERY String
    
    """
    SQL_QUERY = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{position}%"
        ;
    """.format(position=position)
    
    return SQL_QUERY

# goalkeeper
SQL_QUERY = make_query("G")
goalkeeper_df = pd.read_sql(SQL_QUERY, db)

len(goalkeeper_df)









    Out[3]:





289

Scaling



In [4]:

    
X = goalkeeper_df.ix[:,:-1]
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(goalkeeper_df.ix[:,-1], columns=["rating"])
g_df = pd.concat([dfX, dfy], axis=1)
g_df.head()









    Out[4]:






  
    
      
      age
      tall
      weight
      apps_start
      apps_sub
      mins
      goals
      assists
      yel
      red
      ...
      blocks
      owng
      keyp_x
      fouled
      off
      disp
      unstch
      avgp
      ps_y
      rating
    
  
  
    
      0
      7.432029
      15.948156
      11.867021
      2.703886
      0.000000
      2.646419
      0.0
      0.0
      0.816273
      3.232331
      ...
      0.0
      0.0
      1.329329
      0.678815
      0.0
      0.0
      0.000000
      4.708980
      3.971232
      6.62
    
    
      1
      5.945623
      15.614164
      12.139826
      0.073078
      2.104282
      0.141947
      0.0
      0.0
      0.000000
      0.000000
      ...
      0.0
      0.0
      0.000000
      0.000000
      0.0
      0.0
      0.000000
      5.616214
      3.558452
      5.80
    
    
      2
      4.883905
      15.697662
      11.321411
      2.776964
      0.000000
      2.789998
      0.0
      0.0
      0.000000
      0.000000
      ...
      0.0
      0.0
      0.000000
      0.678815
      0.0
      0.0
      1.120947
      2.692903
      3.316477
      6.83
    
    
      3
      7.007342
      15.363669
      10.912203
      2.338496
      0.000000
      2.347841
      0.0
      0.0
      0.000000
      0.000000
      ...
      0.0
      0.0
      0.000000
      0.000000
      0.0
      0.0
      0.000000
      3.844947
      5.999550
      6.92
    
    
      4
      5.096249
      15.614164
      11.594216
      0.438468
      2.104282
      0.442158
      0.0
      0.0
      0.000000
      0.000000
      ...
      0.0
      0.0
      1.329329
      0.000000
      0.0
      0.0
      0.000000
      3.110519
      5.935498
      6.48
    
  

5 rows × 30 columns

Summary OLS



In [5]:

    
model = sm.OLS(g_df.ix[:, -1], g_df.ix[:, :-1])
result = model.fit()
print(result.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.995
Model:                            OLS   Adj. R-squared:                  0.994
Method:                 Least Squares   F-statistic:                     2027.
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          5.85e-286
Time:                        11:04:54   Log-Likelihood:                -197.13
No. Observations:                 289   AIC:                             444.3
Df Residuals:                     264   BIC:                             535.9
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
age            0.0893      0.031      2.919      0.004         0.029     0.150
tall           0.3709      0.038      9.836      0.000         0.297     0.445
weight        -0.0829      0.051     -1.639      0.102        -0.183     0.017
apps_start    -0.4496      1.674     -0.269      0.788        -3.746     2.847
apps_sub       0.0420      0.042      1.009      0.314        -0.040     0.124
mins           0.4503      1.667      0.270      0.787        -2.833     3.733
goals         -0.0014      0.033     -0.043      0.966        -0.067     0.064
assists       -0.0067      0.031     -0.218      0.828        -0.067     0.054
yel           -0.0371      0.038     -0.980      0.328        -0.112     0.037
red           -0.0309      0.038     -0.813      0.417        -0.106     0.044
spg           -0.0048      0.034     -0.143      0.886        -0.071     0.062
ps_x           0.0620      0.016      3.998      0.000         0.031     0.093
motm           0.0585      0.036      1.626      0.105        -0.012     0.129
aw             0.1233      0.035      3.531      0.000         0.055     0.192
tackles       -0.0597      0.031     -1.924      0.055        -0.121     0.001
inter          0.0159      0.030      0.525      0.600        -0.044     0.075
fouls         -0.0102      0.033     -0.305      0.760        -0.076     0.056
offsides    2.141e-17   1.13e-17      1.889      0.060     -9.08e-19  4.37e-17
clear          0.1396      0.033      4.222      0.000         0.075     0.205
drb           -0.0764      0.030     -2.511      0.013        -0.136    -0.016
blocks        1.6e-17   1.06e-17      1.511      0.132     -4.85e-18  3.68e-17
owng          -0.0290      0.030     -0.969      0.334        -0.088     0.030
keyp_x        -0.0113      0.031     -0.362      0.718        -0.073     0.050
fouled        -0.0272      0.035     -0.784      0.434        -0.096     0.041
off        -8.592e-19   2.12e-18     -0.405      0.686     -5.04e-18  3.32e-18
disp          -0.0310      0.031     -1.012      0.313        -0.091     0.029
unstch        -0.0004      0.031     -0.013      0.990        -0.060     0.060
avgp           0.1137      0.035      3.212      0.001         0.044     0.183
ps_y           0.0620      0.016      3.998      0.000         0.031     0.093
==============================================================================
Omnibus:                      288.295   Durbin-Watson:                   2.005
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            17045.089
Skew:                           3.828   Prob(JB):                         0.00
Kurtosis:                      39.836   Cond. No.                     9.51e+15
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.49e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

Remove Some Feature



In [6]:

    
# remove features
remove_column_list = [
    "weight", "apps_start", "apps_sub", "mins", "goals", "assists", "yel", "red", "spg", "motm"
    , "tackles","inter", "fouls", "blocks", "owng", "keyp_x", "fouled", "off", "disp", "unstch"
]
removed_g_df = g_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed_g_df.ix[:, -1], removed_g_df.ix[:, :-1])
result = model.fit()
print(result.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.994
Model:                            OLS   Adj. R-squared:                  0.994
Method:                 Least Squares   F-statistic:                     7263.
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          2.67e-314
Time:                        11:04:55   Log-Likelihood:                -206.16
No. Observations:                 289   AIC:                             426.3
Df Residuals:                     282   BIC:                             452.0
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
age            0.0814      0.029      2.810      0.005         0.024     0.138
tall           0.3138      0.013     23.589      0.000         0.288     0.340
ps_x           0.0688      0.015      4.621      0.000         0.039     0.098
aw             0.1032      0.030      3.404      0.001         0.044     0.163
offsides    1.709e-17    3.8e-18      4.495      0.000      9.61e-18  2.46e-17
clear          0.1396      0.032      4.423      0.000         0.077     0.202
drb           -0.0787      0.030     -2.638      0.009        -0.137    -0.020
avgp           0.0997      0.032      3.125      0.002         0.037     0.163
ps_y           0.0688      0.015      4.621      0.000         0.039     0.098
==============================================================================
Omnibus:                      289.555   Durbin-Watson:                   1.992
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            17990.621
Skew:                           3.835   Prob(JB):                         0.00
Kurtosis:                      40.884   Cond. No.                     3.41e+17
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 8.23e-31. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

Anova & Remove Feature



In [7]:

    
formula_str = """
rating ~ age + tall + ps_x + aw + offsides + clear + drb +  avgp + ps_y
"""

model = sm.OLS.from_formula(formula_str, data=removed_g_df)
result = model.fit()
table_anova = sm.stats.anova_lm(result)
table_anova



In [8]:

    
# remove feature 2
remove_column_list = [
    "age", "offsides", "ps_y"
]
removed2_g_df = removed_g_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed2_g_df.ix[:, -1], removed2_g_df.ix[:, :-1])
result = model.fit()
print(result.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.994
Model:                            OLS   Adj. R-squared:                  0.994
Method:                 Least Squares   F-statistic:                     8271.
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          1.38e-314
Time:                        11:04:58   Log-Likelihood:                -210.15
No. Observations:                 289   AIC:                             432.3
Df Residuals:                     283   BIC:                             454.3
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
tall           0.3411      0.009     37.102      0.000         0.323     0.359
ps_x           0.1426      0.030      4.741      0.000         0.083     0.202
aw             0.1071      0.031      3.495      0.001         0.047     0.167
clear          0.1424      0.032      4.459      0.000         0.080     0.205
drb           -0.0727      0.030     -2.417      0.016        -0.132    -0.013
avgp           0.1075      0.032      3.342      0.001         0.044     0.171
==============================================================================
Omnibus:                      309.888   Durbin-Watson:                   1.937
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            23372.669
Skew:                           4.230   Prob(JB):                         0.00
Kurtosis:                      46.237   Cond. No.                         22.0
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

TEST Predict



In [17]:

    
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(removed2_g_df.ix[:, :-1], removed2_g_df.ix[:, -1])









    Out[17]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [10]:

    
w = model.coef_[1:]



In [21]:

    
N = 5
print(removed2_g_df.loc[N])
v = np.array(removed2_g_df.loc[N][1:-1])

print(w)
print(v)

# predict
print( (w*v).sum()*10 )









    



tall      14.946178
ps_x       3.800427
aw         1.366757
clear      1.558145
drb        0.818273
avgp       3.902549
rating     6.480000
Name: 5, dtype: float64
[ 0.06039669  0.06172685  0.06533551 -0.06013044  0.06325299]
[ 3.80042672  1.36675663  1.55814536  0.81827344  3.90254893]
6.13345738541



In [ ]:



In [ ]:

	df	sum_sq	mean_sq	F	PR(>F)
age	1.0	0.000003	0.000003	0.000022	0.996227
tall	1.0	0.234744	0.234744	1.853932	0.174417
ps_x	1.0	0.998707	0.998707	7.887450	0.005327
aw	1.0	1.994044	1.994044	15.748282	0.000092
offsides	1.0	0.036328	0.036328	0.286909	0.592632
clear	1.0	1.943786	1.943786	15.351360	0.000112
drb	1.0	0.776576	0.776576	6.133133	0.013856
avgp	1.0	0.973691	0.973691	7.689884	0.005925
ps_y	1.0	0.001614	0.001614	0.012749	0.910181
Residual	281.0	35.580157	0.126620	NaN	NaN

	age	tall	weight	apps_start	apps_sub	mins	yel	red	...	keyp_x	fouled	unstch	avgp	ps_y	rating
0	7.432029	15.948156	11.867021	2.703886	0.000000	2.646419	0.816273	3.232331	...	1.329329	0.678815	0.000000	4.708980	3.971232	6.62
1	5.945623	15.614164	12.139826	0.073078	2.104282	0.141947	0.000000	0.000000	...	0.000000	0.000000	0.000000	5.616214	3.558452	5.80
2	4.883905	15.697662	11.321411	2.776964	0.000000	2.789998	0.000000	0.000000	...	0.000000	0.678815	1.120947	2.692903	3.316477	6.83
3	7.007342	15.363669	10.912203	2.338496	0.000000	2.347841	0.000000	0.000000	...	0.000000	0.000000	0.000000	3.844947	5.999550	6.92
4	5.096249	15.614164	11.594216	0.438468	2.104282	0.442158	0.000000	0.000000	...	1.329329	0.000000	0.000000	3.110519	5.935498	6.48