Defencer OLS

Import Package
Connect DB & get Defence Player Data
Scaling
Summary OLS
Remove Feature
Anova & Remove Feature

Import Package



In [2]:

    
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

Connect DB & get Forword Player Data



In [3]:

    
db = MySQLdb.connect(
    "db.fastcamp.us",
    "root",
    "dkstncks",
    "football",
    charset='utf8',
)

def make_query(position):
    """
    
    parameter------------
    position : M, D, F, G
    
    return---------------
    SQL_QUERY String
    
    """
    
    SQL_QUERY = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating        
        FROM player
    """
    
    if position == "F":
        SQL_QUERY += """
            WHERE position not like "%,%" and position like "%FW%" and mins > 270
        """
    
    if position == "M":
        SQL_QUERY += """
            WHERE position not like "%,%" and position like "%M%" and mins > 270
        """
    
    if position == "D":
        SQL_QUERY += """
            WHERE position not like "%,%" and position like "%D%" and position not like " DMC"  and mins > 270
        """
    
    if position == "G":
        SQL_QUERY += """
            WHERE position not like "%,%" and position like "%G%" and mins > 270
        """
    
    return SQL_QUERY

# defencer
SQL_QUERY = make_query("D")
defenser_df = pd.read_sql(SQL_QUERY, db)

len(defenser_df)









    Out[3]:





817

Scaling



In [4]:

    
X = defenser_df.ix[:,:-1]
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(defenser_df.ix[:,-1], columns=["rating"])
d_df = pd.concat([dfX, dfy], axis=1)
d_df.head()









    Out[4]:






  
    
      
      const
      age
      tall
      weight
      apps_start
      apps_sub
      mins
      goals
      assists
      yel
      ...
      blocks
      owng
      keyp_x
      fouled
      off
      disp
      unstch
      avgp
      ps_y
      rating
    
  
  
    
      0
      1
      5.342523
      15.105863
      10.954348
      2.615560
      0.466727
      2.632199
      2.827191
      0.900141
      2.688382
      ...
      1.967895
      2.427616
      0.654447
      2.379510
      0.000000
      0.672855
      1.001993
      4.325854
      11.328504
      6.93
    
    
      1
      1
      6.799575
      14.146761
      9.665601
      3.452539
      0.466727
      3.454909
      0.000000
      2.700424
      2.304327
      ...
      0.327983
      0.000000
      2.290563
      1.665657
      0.000000
      2.018565
      2.337983
      2.718860
      10.961246
      6.90
    
    
      2
      1
      6.556733
      14.466461
      9.923350
      1.569336
      3.733816
      1.738155
      0.000000
      0.000000
      0.384055
      ...
      0.327983
      0.000000
      1.308893
      0.951804
      1.435429
      1.009282
      1.669988
      3.083691
      10.890619
      6.89
    
    
      3
      1
      6.799575
      14.786162
      10.181100
      2.929427
      0.933454
      2.954387
      0.000000
      0.900141
      1.920273
      ...
      2.295878
      2.427616
      0.654447
      0.237951
      1.435429
      0.672855
      1.001993
      3.274793
      11.342629
      6.86
    
    
      4
      1
      7.285259
      14.546387
      9.407851
      3.347916
      0.000000
      3.369309
      0.942397
      3.600565
      4.224600
      ...
      0.655965
      0.000000
      2.945009
      2.141559
      0.000000
      2.691419
      3.339975
      4.508270
      11.653386
      7.60
    
  

5 rows × 31 columns

Summary OLS



In [5]:

    
model = sm.OLS(d_df.ix[:, -1], d_df.ix[:, :-1])
result = model.fit()
print(result.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.767
Model:                            OLS   Adj. R-squared:                  0.759
Method:                 Least Squares   F-statistic:                     92.66
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          1.84e-227
Time:                        11:06:23   Log-Likelihood:                 525.03
No. Observations:                 817   AIC:                            -992.1
Df Residuals:                     788   BIC:                            -855.6
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.4647      0.108     50.706      0.000         5.253     5.676
age           -0.0031      0.005     -0.617      0.538        -0.013     0.007
tall          -0.0080      0.008     -1.015      0.310        -0.023     0.007
weight         0.0149      0.008      1.758      0.079        -0.002     0.032
apps_start     0.1055      0.069      1.537      0.125        -0.029     0.240
apps_sub       0.0161      0.007      2.295      0.022         0.002     0.030
mins          -0.0982      0.068     -1.451      0.147        -0.231     0.035
goals          0.0297      0.006      5.268      0.000         0.019     0.041
assists        0.0319      0.006      5.303      0.000         0.020     0.044
yel           -0.0056      0.006     -0.908      0.364        -0.018     0.007
red           -0.0275      0.005     -5.747      0.000        -0.037    -0.018
spg            0.0102      0.006      1.820      0.069        -0.001     0.021
ps_x           0.0198      0.003      5.674      0.000         0.013     0.027
motm           0.0265      0.006      4.537      0.000         0.015     0.038
aw             0.0636      0.007      9.345      0.000         0.050     0.077
tackles        0.0985      0.006     15.168      0.000         0.086     0.111
inter          0.0741      0.006     13.223      0.000         0.063     0.085
fouls         -0.0288      0.006     -5.060      0.000        -0.040    -0.018
offsides      -0.0173      0.006     -2.901      0.004        -0.029    -0.006
clear          0.0528      0.008      6.648      0.000         0.037     0.068
drb           -0.0172      0.006     -3.072      0.002        -0.028    -0.006
blocks         0.0246      0.006      4.055      0.000         0.013     0.037
owng          -0.0200      0.005     -4.238      0.000        -0.029    -0.011
keyp_x         0.0413      0.007      5.662      0.000         0.027     0.056
fouled         0.0158      0.005      3.040      0.002         0.006     0.026
off            0.0119      0.005      2.505      0.012         0.003     0.021
disp           0.0201      0.007      3.001      0.003         0.007     0.033
unstch         0.0003      0.007      0.046      0.963        -0.014     0.015
avgp           0.0507      0.008      6.381      0.000         0.035     0.066
ps_y           0.0198      0.003      5.674      0.000         0.013     0.027
==============================================================================
Omnibus:                       15.069   Durbin-Watson:                   1.400
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               27.400
Skew:                          -0.012   Prob(JB):                     1.12e-06
Kurtosis:                       3.897   Cond. No.                     9.03e+15
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.75e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

Remove Some Feature



In [6]:

    
# remove features
remove_column_list = [
    "age", "tall", "weight", "apps_start", "apps_sub", "mins", "yel", "unstch"
]
removed_d_df = d_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed_d_df.ix[:, -1], removed_d_df.ix[:, :-1])
result = model.fit()
print(result.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.764
Model:                            OLS   Adj. R-squared:                  0.758
Method:                 Least Squares   F-statistic:                     129.1
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          1.06e-233
Time:                        11:06:24   Log-Likelihood:                 520.29
No. Observations:                 817   AIC:                            -998.6
Df Residuals:                     796   BIC:                            -899.8
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.5091      0.073     75.010      0.000         5.365     5.653
goals          0.0299      0.006      5.442      0.000         0.019     0.041
assists        0.0338      0.006      5.881      0.000         0.023     0.045
red           -0.0269      0.005     -5.724      0.000        -0.036    -0.018
spg            0.0115      0.006      2.088      0.037         0.001     0.022
ps_x           0.0209      0.003      6.153      0.000         0.014     0.028
motm           0.0285      0.006      4.948      0.000         0.017     0.040
aw             0.0672      0.007     10.156      0.000         0.054     0.080
tackles        0.0965      0.006     15.117      0.000         0.084     0.109
inter          0.0712      0.005     13.063      0.000         0.061     0.082
fouls         -0.0308      0.005     -6.002      0.000        -0.041    -0.021
offsides      -0.0184      0.006     -3.114      0.002        -0.030    -0.007
clear          0.0478      0.007      6.551      0.000         0.033     0.062
drb           -0.0171      0.006     -3.051      0.002        -0.028    -0.006
blocks         0.0235      0.006      3.934      0.000         0.012     0.035
owng          -0.0203      0.005     -4.338      0.000        -0.029    -0.011
keyp_x         0.0376      0.007      5.299      0.000         0.024     0.052
fouled         0.0143      0.005      2.826      0.005         0.004     0.024
off            0.0111      0.005      2.359      0.019         0.002     0.020
disp           0.0193      0.006      3.252      0.001         0.008     0.031
avgp           0.0470      0.007      6.387      0.000         0.033     0.061
ps_y           0.0209      0.003      6.153      0.000         0.014     0.028
==============================================================================
Omnibus:                       14.080   Durbin-Watson:                   1.393
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               23.488
Skew:                          -0.084   Prob(JB):                     7.94e-06
Kurtosis:                       3.814   Cond. No.                     2.92e+17
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.91e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

Anova & Remove Feature



In [7]:

    
formula_str = """
rating ~ goals + assists + red + spg + ps_x + motm + aw
+ tackles + inter + fouls + offsides + clear + drb + blocks
+ owng + keyp_x + fouled + off + disp + avgp + ps_y
"""

model = sm.OLS.from_formula(formula_str, data=removed_d_df)
result = model.fit()
table_anova = sm.stats.anova_lm(result)
table_anova









    Out[7]:






  
    
      
      df
      sum_sq
      mean_sq
      F
      PR(>F)
    
  
  
    
      goals
      1.0
      7.351482
      7.351482
      437.207013
      1.015348e-77
    
    
      assists
      1.0
      4.562316
      4.562316
      271.329858
      1.116092e-52
    
    
      red
      1.0
      0.018432
      0.018432
      1.096175
      2.954250e-01
    
    
      spg
      1.0
      1.880096
      1.880096
      111.812964
      1.524576e-24
    
    
      ps_x
      1.0
      2.231002
      2.231002
      132.682065
      1.671015e-28
    
    
      motm
      1.0
      7.489474
      7.489474
      445.413638
      7.204196e-79
    
    
      aw
      1.0
      4.249837
      4.249837
      252.746118
      1.245196e-49
    
    
      tackles
      1.0
      8.439651
      8.439651
      501.922530
      1.403409e-86
    
    
      inter
      1.0
      3.188942
      3.188942
      189.652623
      7.383641e-39
    
    
      fouls
      1.0
      0.364824
      0.364824
      21.696824
      3.741386e-06
    
    
      offsides
      1.0
      0.108970
      0.108970
      6.480675
      1.109283e-02
    
    
      clear
      1.0
      0.577817
      0.577817
      34.363883
      6.693984e-09
    
    
      drb
      1.0
      0.071945
      0.071945
      4.278703
      3.891519e-02
    
    
      blocks
      1.0
      0.122185
      0.122185
      7.266577
      7.173233e-03
    
    
      owng
      1.0
      0.329726
      0.329726
      19.609450
      1.082698e-05
    
    
      keyp_x
      1.0
      1.273226
      1.273226
      75.721244
      1.859889e-17
    
    
      fouled
      1.0
      0.185722
      0.185722
      11.045265
      9.297657e-04
    
    
      off
      1.0
      0.090795
      0.090795
      5.399748
      2.039063e-02
    
    
      disp
      1.0
      0.183902
      0.183902
      10.936982
      9.849140e-04
    
    
      avgp
      1.0
      0.685988
      0.685988
      40.797051
      2.870051e-10
    
    
      ps_y
      1.0
      0.016720
      0.016720
      0.994387
      3.189756e-01
    
    
      Residual
      796.0
      13.384460
      0.016815
      NaN
      NaN



In [8]:

    
# remove feature 2
remove_column_list = [
    "red", "offsides", "drb", "blocks", "off", "disp", "ps_y"
]
removed2_d_df = removed_d_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed2_d_df.ix[:, -1], removed2_d_df.ix[:, :-1])
result = model.fit()
print(result.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.740
Model:                            OLS   Adj. R-squared:                  0.736
Method:                 Least Squares   F-statistic:                     163.4
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          1.28e-223
Time:                        11:06:26   Log-Likelihood:                 480.88
No. Observations:                 817   AIC:                            -931.8
Df Residuals:                     802   BIC:                            -861.2
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.4991      0.076     72.405      0.000         5.350     5.648
goals          0.0330      0.006      5.826      0.000         0.022     0.044
assists        0.0320      0.006      5.331      0.000         0.020     0.044
spg            0.0124      0.006      2.158      0.031         0.001     0.024
ps_x           0.0447      0.007      6.347      0.000         0.031     0.059
motm           0.0264      0.006      4.422      0.000         0.015     0.038
aw             0.0684      0.007      9.977      0.000         0.055     0.082
tackles        0.0944      0.006     16.304      0.000         0.083     0.106
inter          0.0667      0.006     11.839      0.000         0.056     0.078
fouls         -0.0341      0.005     -6.418      0.000        -0.045    -0.024
clear          0.0480      0.007      7.289      0.000         0.035     0.061
owng          -0.0200      0.005     -4.123      0.000        -0.030    -0.010
keyp_x         0.0475      0.007      6.701      0.000         0.034     0.061
fouled         0.0171      0.005      3.274      0.001         0.007     0.027
avgp           0.0400      0.008      5.304      0.000         0.025     0.055
==============================================================================
Omnibus:                        9.852   Durbin-Watson:                   1.362
Prob(Omnibus):                  0.007   Jarque-Bera (JB):               15.202
Skew:                          -0.008   Prob(JB):                     0.000500
Kurtosis:                       3.668   Cond. No.                         211.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [9]:

    
# tackles
# aw : 공중볼 경합
# inter
# clear



In [ ]:

	const	age	tall	weight	apps_start	apps_sub	mins	goals	assists	yel	...	blocks	owng	keyp_x	fouled	off	disp	unstch	avgp	ps_y	rating
0	1	5.342523	15.105863	10.954348	2.615560	0.466727	2.632199	2.827191	0.900141	2.688382	...	1.967895	2.427616	0.654447	2.379510	0.000000	0.672855	1.001993	4.325854	11.328504	6.93
1	1	6.799575	14.146761	9.665601	3.452539	0.466727	3.454909	0.000000	2.700424	2.304327	...	0.327983	0.000000	2.290563	1.665657	0.000000	2.018565	2.337983	2.718860	10.961246	6.90
2	1	6.556733	14.466461	9.923350	1.569336	3.733816	1.738155	0.000000	0.000000	0.384055	...	0.327983	0.000000	1.308893	0.951804	1.435429	1.009282	1.669988	3.083691	10.890619	6.89
3	1	6.799575	14.786162	10.181100	2.929427	0.933454	2.954387	0.000000	0.900141	1.920273	...	2.295878	2.427616	0.654447	0.237951	1.435429	0.672855	1.001993	3.274793	11.342629	6.86
4	1	7.285259	14.546387	9.407851	3.347916	0.000000	3.369309	0.942397	3.600565	4.224600	...	0.655965	0.000000	2.945009	2.141559	0.000000	2.691419	3.339975	4.508270	11.653386	7.60

	df	sum_sq	mean_sq	F	PR(>F)
goals	1.0	7.351482	7.351482	437.207013	1.015348e-77
assists	1.0	4.562316	4.562316	271.329858	1.116092e-52
red	1.0	0.018432	0.018432	1.096175	2.954250e-01
spg	1.0	1.880096	1.880096	111.812964	1.524576e-24
ps_x	1.0	2.231002	2.231002	132.682065	1.671015e-28
motm	1.0	7.489474	7.489474	445.413638	7.204196e-79
aw	1.0	4.249837	4.249837	252.746118	1.245196e-49
tackles	1.0	8.439651	8.439651	501.922530	1.403409e-86
inter	1.0	3.188942	3.188942	189.652623	7.383641e-39
fouls	1.0	0.364824	0.364824	21.696824	3.741386e-06
offsides	1.0	0.108970	0.108970	6.480675	1.109283e-02
clear	1.0	0.577817	0.577817	34.363883	6.693984e-09
drb	1.0	0.071945	0.071945	4.278703	3.891519e-02
blocks	1.0	0.122185	0.122185	7.266577	7.173233e-03
owng	1.0	0.329726	0.329726	19.609450	1.082698e-05
keyp_x	1.0	1.273226	1.273226	75.721244	1.859889e-17
fouled	1.0	0.185722	0.185722	11.045265	9.297657e-04
off	1.0	0.090795	0.090795	5.399748	2.039063e-02
disp	1.0	0.183902	0.183902	10.936982	9.849140e-04
avgp	1.0	0.685988	0.685988	40.797051	2.870051e-10
ps_y	1.0	0.016720	0.016720	0.994387	3.189756e-01
Residual	796.0	13.384460	0.016815	NaN	NaN