Defencer OLS

  • Import Package
  • Connect DB & get Defence Player Data
  • Scaling
  • Summary OLS
  • Remove Feature
  • Anova & Remove Feature

Import Package


In [2]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

Connect DB & get Forword Player Data


In [3]:
db = MySQLdb.connect(
    "db.fastcamp.us",
    "root",
    "dkstncks",
    "football",
    charset='utf8',
)

def make_query(position):
    """
    
    parameter------------
    position : M, D, F, G
    
    return---------------
    SQL_QUERY String
    
    """
    
    SQL_QUERY = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating        
        FROM player
    """
    
    if position == "F":
        SQL_QUERY += """
            WHERE position not like "%,%" and position like "%FW%" and mins > 270
        """
    
    if position == "M":
        SQL_QUERY += """
            WHERE position not like "%,%" and position like "%M%" and mins > 270
        """
    
    if position == "D":
        SQL_QUERY += """
            WHERE position not like "%,%" and position like "%D%" and position not like " DMC"  and mins > 270
        """
    
    if position == "G":
        SQL_QUERY += """
            WHERE position not like "%,%" and position like "%G%" and mins > 270
        """
    
    return SQL_QUERY

# defencer
SQL_QUERY = make_query("D")
defenser_df = pd.read_sql(SQL_QUERY, db)

len(defenser_df)


Out[3]:
817
Scaling

In [4]:
X = defenser_df.ix[:,:-1]
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(defenser_df.ix[:,-1], columns=["rating"])
d_df = pd.concat([dfX, dfy], axis=1)
d_df.head()


Out[4]:
const age tall weight apps_start apps_sub mins goals assists yel ... blocks owng keyp_x fouled off disp unstch avgp ps_y rating
0 1 5.342523 15.105863 10.954348 2.615560 0.466727 2.632199 2.827191 0.900141 2.688382 ... 1.967895 2.427616 0.654447 2.379510 0.000000 0.672855 1.001993 4.325854 11.328504 6.93
1 1 6.799575 14.146761 9.665601 3.452539 0.466727 3.454909 0.000000 2.700424 2.304327 ... 0.327983 0.000000 2.290563 1.665657 0.000000 2.018565 2.337983 2.718860 10.961246 6.90
2 1 6.556733 14.466461 9.923350 1.569336 3.733816 1.738155 0.000000 0.000000 0.384055 ... 0.327983 0.000000 1.308893 0.951804 1.435429 1.009282 1.669988 3.083691 10.890619 6.89
3 1 6.799575 14.786162 10.181100 2.929427 0.933454 2.954387 0.000000 0.900141 1.920273 ... 2.295878 2.427616 0.654447 0.237951 1.435429 0.672855 1.001993 3.274793 11.342629 6.86
4 1 7.285259 14.546387 9.407851 3.347916 0.000000 3.369309 0.942397 3.600565 4.224600 ... 0.655965 0.000000 2.945009 2.141559 0.000000 2.691419 3.339975 4.508270 11.653386 7.60

5 rows × 31 columns

Summary OLS

In [5]:
model = sm.OLS(d_df.ix[:, -1], d_df.ix[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.767
Model:                            OLS   Adj. R-squared:                  0.759
Method:                 Least Squares   F-statistic:                     92.66
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          1.84e-227
Time:                        11:06:23   Log-Likelihood:                 525.03
No. Observations:                 817   AIC:                            -992.1
Df Residuals:                     788   BIC:                            -855.6
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.4647      0.108     50.706      0.000         5.253     5.676
age           -0.0031      0.005     -0.617      0.538        -0.013     0.007
tall          -0.0080      0.008     -1.015      0.310        -0.023     0.007
weight         0.0149      0.008      1.758      0.079        -0.002     0.032
apps_start     0.1055      0.069      1.537      0.125        -0.029     0.240
apps_sub       0.0161      0.007      2.295      0.022         0.002     0.030
mins          -0.0982      0.068     -1.451      0.147        -0.231     0.035
goals          0.0297      0.006      5.268      0.000         0.019     0.041
assists        0.0319      0.006      5.303      0.000         0.020     0.044
yel           -0.0056      0.006     -0.908      0.364        -0.018     0.007
red           -0.0275      0.005     -5.747      0.000        -0.037    -0.018
spg            0.0102      0.006      1.820      0.069        -0.001     0.021
ps_x           0.0198      0.003      5.674      0.000         0.013     0.027
motm           0.0265      0.006      4.537      0.000         0.015     0.038
aw             0.0636      0.007      9.345      0.000         0.050     0.077
tackles        0.0985      0.006     15.168      0.000         0.086     0.111
inter          0.0741      0.006     13.223      0.000         0.063     0.085
fouls         -0.0288      0.006     -5.060      0.000        -0.040    -0.018
offsides      -0.0173      0.006     -2.901      0.004        -0.029    -0.006
clear          0.0528      0.008      6.648      0.000         0.037     0.068
drb           -0.0172      0.006     -3.072      0.002        -0.028    -0.006
blocks         0.0246      0.006      4.055      0.000         0.013     0.037
owng          -0.0200      0.005     -4.238      0.000        -0.029    -0.011
keyp_x         0.0413      0.007      5.662      0.000         0.027     0.056
fouled         0.0158      0.005      3.040      0.002         0.006     0.026
off            0.0119      0.005      2.505      0.012         0.003     0.021
disp           0.0201      0.007      3.001      0.003         0.007     0.033
unstch         0.0003      0.007      0.046      0.963        -0.014     0.015
avgp           0.0507      0.008      6.381      0.000         0.035     0.066
ps_y           0.0198      0.003      5.674      0.000         0.013     0.027
==============================================================================
Omnibus:                       15.069   Durbin-Watson:                   1.400
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               27.400
Skew:                          -0.012   Prob(JB):                     1.12e-06
Kurtosis:                       3.897   Cond. No.                     9.03e+15
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.75e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
Remove Some Feature

In [6]:
# remove features
remove_column_list = [
    "age", "tall", "weight", "apps_start", "apps_sub", "mins", "yel", "unstch"
]
removed_d_df = d_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed_d_df.ix[:, -1], removed_d_df.ix[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.764
Model:                            OLS   Adj. R-squared:                  0.758
Method:                 Least Squares   F-statistic:                     129.1
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          1.06e-233
Time:                        11:06:24   Log-Likelihood:                 520.29
No. Observations:                 817   AIC:                            -998.6
Df Residuals:                     796   BIC:                            -899.8
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.5091      0.073     75.010      0.000         5.365     5.653
goals          0.0299      0.006      5.442      0.000         0.019     0.041
assists        0.0338      0.006      5.881      0.000         0.023     0.045
red           -0.0269      0.005     -5.724      0.000        -0.036    -0.018
spg            0.0115      0.006      2.088      0.037         0.001     0.022
ps_x           0.0209      0.003      6.153      0.000         0.014     0.028
motm           0.0285      0.006      4.948      0.000         0.017     0.040
aw             0.0672      0.007     10.156      0.000         0.054     0.080
tackles        0.0965      0.006     15.117      0.000         0.084     0.109
inter          0.0712      0.005     13.063      0.000         0.061     0.082
fouls         -0.0308      0.005     -6.002      0.000        -0.041    -0.021
offsides      -0.0184      0.006     -3.114      0.002        -0.030    -0.007
clear          0.0478      0.007      6.551      0.000         0.033     0.062
drb           -0.0171      0.006     -3.051      0.002        -0.028    -0.006
blocks         0.0235      0.006      3.934      0.000         0.012     0.035
owng          -0.0203      0.005     -4.338      0.000        -0.029    -0.011
keyp_x         0.0376      0.007      5.299      0.000         0.024     0.052
fouled         0.0143      0.005      2.826      0.005         0.004     0.024
off            0.0111      0.005      2.359      0.019         0.002     0.020
disp           0.0193      0.006      3.252      0.001         0.008     0.031
avgp           0.0470      0.007      6.387      0.000         0.033     0.061
ps_y           0.0209      0.003      6.153      0.000         0.014     0.028
==============================================================================
Omnibus:                       14.080   Durbin-Watson:                   1.393
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               23.488
Skew:                          -0.084   Prob(JB):                     7.94e-06
Kurtosis:                       3.814   Cond. No.                     2.92e+17
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.91e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
Anova & Remove Feature

In [7]:
formula_str = """
rating ~ goals + assists + red + spg + ps_x + motm + aw
+ tackles + inter + fouls + offsides + clear + drb + blocks
+ owng + keyp_x + fouled + off + disp + avgp + ps_y
"""

model = sm.OLS.from_formula(formula_str, data=removed_d_df)
result = model.fit()
table_anova = sm.stats.anova_lm(result)
table_anova


Out[7]:
df sum_sq mean_sq F PR(>F)
goals 1.0 7.351482 7.351482 437.207013 1.015348e-77
assists 1.0 4.562316 4.562316 271.329858 1.116092e-52
red 1.0 0.018432 0.018432 1.096175 2.954250e-01
spg 1.0 1.880096 1.880096 111.812964 1.524576e-24
ps_x 1.0 2.231002 2.231002 132.682065 1.671015e-28
motm 1.0 7.489474 7.489474 445.413638 7.204196e-79
aw 1.0 4.249837 4.249837 252.746118 1.245196e-49
tackles 1.0 8.439651 8.439651 501.922530 1.403409e-86
inter 1.0 3.188942 3.188942 189.652623 7.383641e-39
fouls 1.0 0.364824 0.364824 21.696824 3.741386e-06
offsides 1.0 0.108970 0.108970 6.480675 1.109283e-02
clear 1.0 0.577817 0.577817 34.363883 6.693984e-09
drb 1.0 0.071945 0.071945 4.278703 3.891519e-02
blocks 1.0 0.122185 0.122185 7.266577 7.173233e-03
owng 1.0 0.329726 0.329726 19.609450 1.082698e-05
keyp_x 1.0 1.273226 1.273226 75.721244 1.859889e-17
fouled 1.0 0.185722 0.185722 11.045265 9.297657e-04
off 1.0 0.090795 0.090795 5.399748 2.039063e-02
disp 1.0 0.183902 0.183902 10.936982 9.849140e-04
avgp 1.0 0.685988 0.685988 40.797051 2.870051e-10
ps_y 1.0 0.016720 0.016720 0.994387 3.189756e-01
Residual 796.0 13.384460 0.016815 NaN NaN

In [8]:
# remove feature 2
remove_column_list = [
    "red", "offsides", "drb", "blocks", "off", "disp", "ps_y"
]
removed2_d_df = removed_d_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed2_d_df.ix[:, -1], removed2_d_df.ix[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.740
Model:                            OLS   Adj. R-squared:                  0.736
Method:                 Least Squares   F-statistic:                     163.4
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          1.28e-223
Time:                        11:06:26   Log-Likelihood:                 480.88
No. Observations:                 817   AIC:                            -931.8
Df Residuals:                     802   BIC:                            -861.2
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.4991      0.076     72.405      0.000         5.350     5.648
goals          0.0330      0.006      5.826      0.000         0.022     0.044
assists        0.0320      0.006      5.331      0.000         0.020     0.044
spg            0.0124      0.006      2.158      0.031         0.001     0.024
ps_x           0.0447      0.007      6.347      0.000         0.031     0.059
motm           0.0264      0.006      4.422      0.000         0.015     0.038
aw             0.0684      0.007      9.977      0.000         0.055     0.082
tackles        0.0944      0.006     16.304      0.000         0.083     0.106
inter          0.0667      0.006     11.839      0.000         0.056     0.078
fouls         -0.0341      0.005     -6.418      0.000        -0.045    -0.024
clear          0.0480      0.007      7.289      0.000         0.035     0.061
owng          -0.0200      0.005     -4.123      0.000        -0.030    -0.010
keyp_x         0.0475      0.007      6.701      0.000         0.034     0.061
fouled         0.0171      0.005      3.274      0.001         0.007     0.027
avgp           0.0400      0.008      5.304      0.000         0.025     0.055
==============================================================================
Omnibus:                        9.852   Durbin-Watson:                   1.362
Prob(Omnibus):                  0.007   Jarque-Bera (JB):               15.202
Skew:                          -0.008   Prob(JB):                     0.000500
Kurtosis:                       3.668   Cond. No.                         211.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [9]:
# tackles
# aw : 공중볼 경합
# inter
# clear

In [ ]: