Forword OLS

  • Import Package
  • Connect DB & get Forword Player Data
  • Scaling
  • Summary OLS
  • Remove Feature

Import Package


In [25]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

Connect DB & get Forword Player Data


In [26]:
db = MySQLdb.connect(
    "db.fastcamp.us",
    "root",
    "dkstncks",
    "football",
    charset='utf8',
)

def make_query(position):
    """
    
    parameter------------
    position : M, D, F, G
    
    return---------------
    SQL_QUERY String
    
    """
    SQL_QUERY = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{position}%" and position not like "%D%" and mins > 270
        ;
    """.format(position=position)
    
    return SQL_QUERY

# forword
SQL_QUERY = make_query("F")
forword_df = pd.read_sql(SQL_QUERY, db)

len(forword_df)


Out[26]:
611
Scaling

In [27]:
X = forword_df.ix[:,:-1]
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(forword_df.ix[:,-1], columns=["rating"])
f_df = pd.concat([dfX, dfy], axis=1)
f_df.head()


Out[27]:
const age tall weight apps_start apps_sub mins goals assists yel ... blocks owng keyp_x fouled off disp unstch avgp ps_y rating
0 1 8.382932 9.541107 6.649772 2.985339 0.820926 3.276870 3.344336 2.398933 3.995343 ... 1.313059 0.0 2.015096 3.009618 3.162057 3.701969 3.353454 2.286773 8.088244 7.37
1 1 6.945858 9.699247 7.181753 2.686805 0.615694 2.803518 1.170518 2.398933 4.439270 ... 1.313059 0.0 1.612077 3.762023 1.149839 2.005233 2.624442 3.353934 8.501999 7.25
2 1 6.945858 9.699247 7.093090 0.298534 1.231389 0.341144 0.000000 0.000000 0.000000 ... 2.626118 0.0 0.403019 0.451443 0.000000 1.388238 0.874814 1.088940 9.783304 6.22
3 1 5.987809 9.224827 6.383781 3.582407 0.410463 3.609751 3.678769 1.999111 2.219635 ... 1.313059 0.0 2.821134 1.504809 1.149839 2.005233 2.624442 3.887515 10.450651 7.50
4 1 7.664395 9.751960 7.004426 1.691692 2.668009 1.997285 1.839385 1.599289 1.331781 ... 1.313059 0.0 1.410567 1.805771 1.437298 2.159482 2.332838 1.295838 8.795631 6.98

5 rows × 31 columns

Summary OLS

In [28]:
model = sm.OLS(f_df.ix[:, -1], f_df.ix[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.919
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     234.8
Date:                Sun, 26 Jun 2016   Prob (F-statistic):          5.79e-296
Time:                        17:27:52   Log-Likelihood:                 461.08
No. Observations:                 611   AIC:                            -864.2
Df Residuals:                     582   BIC:                            -736.1
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6388      0.091     62.193      0.000         5.461     5.817
age           -0.0091      0.005     -1.700      0.090        -0.020     0.001
tall           0.0084      0.008      1.100      0.272        -0.007     0.023
weight         0.0038      0.008      0.474      0.635        -0.012     0.020
apps_start     0.0661      0.049      1.357      0.175        -0.030     0.162
apps_sub   -8.385e-05      0.009     -0.010      0.992        -0.017     0.017
mins          -0.1498      0.049     -3.037      0.002        -0.247    -0.053
goals          0.1742      0.011     15.252      0.000         0.152     0.197
assists        0.0666      0.008      8.427      0.000         0.051     0.082
yel           -0.0201      0.006     -3.101      0.002        -0.033    -0.007
red           -0.0062      0.005     -1.249      0.212        -0.016     0.004
spg            0.0837      0.009      9.308      0.000         0.066     0.101
ps_x           0.0165      0.003      4.818      0.000         0.010     0.023
motm           0.0212      0.009      2.400      0.017         0.004     0.039
aw             0.0829      0.008     10.207      0.000         0.067     0.099
tackles        0.0656      0.007      8.841      0.000         0.051     0.080
inter          0.0489      0.007      6.859      0.000         0.035     0.063
fouls         -0.0192      0.007     -2.778      0.006        -0.033    -0.006
offsides      -0.0112      0.005     -2.212      0.027        -0.021    -0.001
clear          0.0051      0.007      0.758      0.449        -0.008     0.018
drb           -0.0179      0.006     -2.772      0.006        -0.031    -0.005
blocks         0.0056      0.005      1.038      0.300        -0.005     0.016
owng          -0.0046      0.005     -0.917      0.360        -0.014     0.005
keyp_x         0.0994      0.008     12.079      0.000         0.083     0.116
fouled         0.0396      0.007      5.871      0.000         0.026     0.053
off           -0.0101      0.006     -1.615      0.107        -0.022     0.002
disp           0.0097      0.007      1.341      0.180        -0.005     0.024
unstch        -0.0047      0.008     -0.627      0.531        -0.020     0.010
avgp           0.0326      0.010      3.383      0.001         0.014     0.052
ps_y           0.0165      0.003      4.818      0.000         0.010     0.023
==============================================================================
Omnibus:                       44.878   Durbin-Watson:                   1.984
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              105.265
Skew:                           0.398   Prob(JB):                     1.39e-23
Kurtosis:                       4.871   Cond. No.                     9.04e+15
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.07e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
Remove Some Feature

In [37]:
# remove features
remove_column_list = [
    "age", "tall", "weight", "apps_start", "apps_sub", "red", "clear", "blocks", "owng", "unstch", "offsides", "disp", "off"
]
removed_f_df = f_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed_f_df.ix[:, -1], removed_f_df.ix[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.913
Method:                 Least Squares   F-statistic:                     429.4
Date:                Sun, 26 Jun 2016   Prob (F-statistic):          2.74e-307
Time:                        17:33:18   Log-Likelihood:                 449.12
No. Observations:                 611   AIC:                            -866.2
Df Residuals:                     595   BIC:                            -795.6
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6896      0.067     84.360      0.000         5.557     5.822
mins          -0.0836      0.009     -8.811      0.000        -0.102    -0.065
goals          0.1689      0.011     15.172      0.000         0.147     0.191
assists        0.0665      0.008      8.568      0.000         0.051     0.082
yel           -0.0228      0.006     -3.603      0.000        -0.035    -0.010
spg            0.0836      0.009      9.704      0.000         0.067     0.100
ps_x           0.0156      0.003      4.572      0.000         0.009     0.022
motm           0.0235      0.009      2.711      0.007         0.006     0.041
aw             0.0802      0.007     11.491      0.000         0.066     0.094
tackles        0.0691      0.007      9.406      0.000         0.055     0.084
inter          0.0469      0.007      7.214      0.000         0.034     0.060
fouls         -0.0153      0.007     -2.338      0.020        -0.028    -0.002
drb           -0.0165      0.006     -2.563      0.011        -0.029    -0.004
keyp_x         0.1012      0.008     12.456      0.000         0.085     0.117
fouled         0.0385      0.006      6.022      0.000         0.026     0.051
avgp           0.0347      0.009      3.755      0.000         0.017     0.053
ps_y           0.0156      0.003      4.572      0.000         0.009     0.022
==============================================================================
Omnibus:                       45.982   Durbin-Watson:                   1.973
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              104.109
Skew:                           0.421   Prob(JB):                     2.47e-23
Kurtosis:                       4.839   Cond. No.                     4.26e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 7.4e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [39]:
formula_str = """
rating ~ mins + goals + assists + yel + spg + ps_x
+ motm + aw + tackles + inter + fouls + drb + keyp_x
+ fouled + avgp + ps_y
"""

model = sm.OLS.from_formula(formula_str, data=removed_f_df)
result = model.fit()
table_anova = sm.stats.anova_lm(result)
table_anova


Out[39]:
df sum_sq mean_sq F PR(>F)
mins 1.0 44.227823 44.227823 3199.652899 1.467266e-241
goals 1.0 14.209164 14.209164 1027.959101 9.237160e-132
assists 1.0 7.687434 7.687434 556.145853 2.536368e-87
yel 1.0 0.116096 0.116096 8.398942 3.892666e-03
spg 1.0 5.702216 5.702216 412.525675 4.544223e-70
ps_x 1.0 0.716860 0.716860 51.861086 1.807262e-12
motm 1.0 2.835558 2.835558 205.137890 3.429509e-40
aw 1.0 2.351298 2.351298 170.104197 2.240872e-34
tackles 1.0 5.572391 5.572391 403.133490 7.421342e-69
inter 1.0 1.060663 1.060663 76.733450 2.031578e-17
fouls 1.0 0.004059 0.004059 0.293674 5.880784e-01
drb 1.0 0.007110 0.007110 0.514381 4.735300e-01
keyp_x 1.0 3.789535 3.789535 274.153114 6.326110e-51
fouled 1.0 0.551940 0.551940 39.930013 5.162150e-10
avgp 1.0 0.194879 0.194879 14.098477 1.905189e-04
ps_y 1.0 0.008867 0.008867 0.641490 4.234913e-01
Residual 595.0 8.224503 0.013823 NaN NaN

In [40]:
# remove feature 2
remove_column_list = [
    "fouls", "drb", "ps_y"
]
removed2_f_df = removed_f_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed2_f_df.ix[:, -1], removed2_f_df.ix[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.914
Model:                            OLS   Adj. R-squared:                  0.912
Method:                 Least Squares   F-statistic:                     486.5
Date:                Sun, 26 Jun 2016   Prob (F-statistic):          1.93e-307
Time:                        17:33:54   Log-Likelihood:                 443.11
No. Observations:                 611   AIC:                            -858.2
Df Residuals:                     597   BIC:                            -796.4
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6694      0.068     83.946      0.000         5.537     5.802
mins          -0.0838      0.010     -8.803      0.000        -0.102    -0.065
goals          0.1726      0.011     15.448      0.000         0.151     0.195
assists        0.0662      0.008      8.461      0.000         0.051     0.082
yel           -0.0280      0.006     -4.658      0.000        -0.040    -0.016
spg            0.0801      0.008      9.458      0.000         0.063     0.097
ps_x           0.0319      0.007      4.640      0.000         0.018     0.045
motm           0.0248      0.009      2.841      0.005         0.008     0.042
aw             0.0769      0.007     11.273      0.000         0.063     0.090
tackles        0.0593      0.007      8.766      0.000         0.046     0.073
inter          0.0452      0.007      6.918      0.000         0.032     0.058
keyp_x         0.1016      0.008     12.432      0.000         0.086     0.118
fouled         0.0331      0.006      5.391      0.000         0.021     0.045
avgp           0.0329      0.009      3.613      0.000         0.015     0.051
==============================================================================
Omnibus:                       47.274   Durbin-Watson:                   1.982
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              120.209
Skew:                           0.394   Prob(JB):                     7.89e-27
Kurtosis:                       5.025   Cond. No.                         155.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [1]:
# forword
# goals
# keyp_x : 키 패스
# spg : 게임당 슈팅수
# assists
# aw : 공중볼 경합승리
# fouled : 파울당한 횟수