Forword OLS

Import Package
Connect DB & get Forword Player Data
Scaling
Summary OLS
Remove Feature

Import Package



In [25]:

    
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

Connect DB & get Forword Player Data



In [26]:

    
db = MySQLdb.connect(
    "db.fastcamp.us",
    "root",
    "dkstncks",
    "football",
    charset='utf8',
)

def make_query(position):
    """
    
    parameter------------
    position : M, D, F, G
    
    return---------------
    SQL_QUERY String
    
    """
    SQL_QUERY = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{position}%" and position not like "%D%" and mins > 270
        ;
    """.format(position=position)
    
    return SQL_QUERY

# forword
SQL_QUERY = make_query("F")
forword_df = pd.read_sql(SQL_QUERY, db)

len(forword_df)









    Out[26]:





611

Scaling



In [27]:

    
X = forword_df.ix[:,:-1]
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(forword_df.ix[:,-1], columns=["rating"])
f_df = pd.concat([dfX, dfy], axis=1)
f_df.head()









    Out[27]:






  
    
      
      const
      age
      tall
      weight
      apps_start
      apps_sub
      mins
      goals
      assists
      yel
      ...
      blocks
      owng
      keyp_x
      fouled
      off
      disp
      unstch
      avgp
      ps_y
      rating
    
  
  
    
      0
      1
      8.382932
      9.541107
      6.649772
      2.985339
      0.820926
      3.276870
      3.344336
      2.398933
      3.995343
      ...
      1.313059
      0.0
      2.015096
      3.009618
      3.162057
      3.701969
      3.353454
      2.286773
      8.088244
      7.37
    
    
      1
      1
      6.945858
      9.699247
      7.181753
      2.686805
      0.615694
      2.803518
      1.170518
      2.398933
      4.439270
      ...
      1.313059
      0.0
      1.612077
      3.762023
      1.149839
      2.005233
      2.624442
      3.353934
      8.501999
      7.25
    
    
      2
      1
      6.945858
      9.699247
      7.093090
      0.298534
      1.231389
      0.341144
      0.000000
      0.000000
      0.000000
      ...
      2.626118
      0.0
      0.403019
      0.451443
      0.000000
      1.388238
      0.874814
      1.088940
      9.783304
      6.22
    
    
      3
      1
      5.987809
      9.224827
      6.383781
      3.582407
      0.410463
      3.609751
      3.678769
      1.999111
      2.219635
      ...
      1.313059
      0.0
      2.821134
      1.504809
      1.149839
      2.005233
      2.624442
      3.887515
      10.450651
      7.50
    
    
      4
      1
      7.664395
      9.751960
      7.004426
      1.691692
      2.668009
      1.997285
      1.839385
      1.599289
      1.331781
      ...
      1.313059
      0.0
      1.410567
      1.805771
      1.437298
      2.159482
      2.332838
      1.295838
      8.795631
      6.98
    
  

5 rows × 31 columns

Summary OLS



In [28]:

    
model = sm.OLS(f_df.ix[:, -1], f_df.ix[:, :-1])
result = model.fit()
print(result.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.919
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     234.8
Date:                Sun, 26 Jun 2016   Prob (F-statistic):          5.79e-296
Time:                        17:27:52   Log-Likelihood:                 461.08
No. Observations:                 611   AIC:                            -864.2
Df Residuals:                     582   BIC:                            -736.1
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6388      0.091     62.193      0.000         5.461     5.817
age           -0.0091      0.005     -1.700      0.090        -0.020     0.001
tall           0.0084      0.008      1.100      0.272        -0.007     0.023
weight         0.0038      0.008      0.474      0.635        -0.012     0.020
apps_start     0.0661      0.049      1.357      0.175        -0.030     0.162
apps_sub   -8.385e-05      0.009     -0.010      0.992        -0.017     0.017
mins          -0.1498      0.049     -3.037      0.002        -0.247    -0.053
goals          0.1742      0.011     15.252      0.000         0.152     0.197
assists        0.0666      0.008      8.427      0.000         0.051     0.082
yel           -0.0201      0.006     -3.101      0.002        -0.033    -0.007
red           -0.0062      0.005     -1.249      0.212        -0.016     0.004
spg            0.0837      0.009      9.308      0.000         0.066     0.101
ps_x           0.0165      0.003      4.818      0.000         0.010     0.023
motm           0.0212      0.009      2.400      0.017         0.004     0.039
aw             0.0829      0.008     10.207      0.000         0.067     0.099
tackles        0.0656      0.007      8.841      0.000         0.051     0.080
inter          0.0489      0.007      6.859      0.000         0.035     0.063
fouls         -0.0192      0.007     -2.778      0.006        -0.033    -0.006
offsides      -0.0112      0.005     -2.212      0.027        -0.021    -0.001
clear          0.0051      0.007      0.758      0.449        -0.008     0.018
drb           -0.0179      0.006     -2.772      0.006        -0.031    -0.005
blocks         0.0056      0.005      1.038      0.300        -0.005     0.016
owng          -0.0046      0.005     -0.917      0.360        -0.014     0.005
keyp_x         0.0994      0.008     12.079      0.000         0.083     0.116
fouled         0.0396      0.007      5.871      0.000         0.026     0.053
off           -0.0101      0.006     -1.615      0.107        -0.022     0.002
disp           0.0097      0.007      1.341      0.180        -0.005     0.024
unstch        -0.0047      0.008     -0.627      0.531        -0.020     0.010
avgp           0.0326      0.010      3.383      0.001         0.014     0.052
ps_y           0.0165      0.003      4.818      0.000         0.010     0.023
==============================================================================
Omnibus:                       44.878   Durbin-Watson:                   1.984
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              105.265
Skew:                           0.398   Prob(JB):                     1.39e-23
Kurtosis:                       4.871   Cond. No.                     9.04e+15
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.07e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

Remove Some Feature



In [37]:

    
# remove features
remove_column_list = [
    "age", "tall", "weight", "apps_start", "apps_sub", "red", "clear", "blocks", "owng", "unstch", "offsides", "disp", "off"
]
removed_f_df = f_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed_f_df.ix[:, -1], removed_f_df.ix[:, :-1])
result = model.fit()
print(result.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.913
Method:                 Least Squares   F-statistic:                     429.4
Date:                Sun, 26 Jun 2016   Prob (F-statistic):          2.74e-307
Time:                        17:33:18   Log-Likelihood:                 449.12
No. Observations:                 611   AIC:                            -866.2
Df Residuals:                     595   BIC:                            -795.6
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6896      0.067     84.360      0.000         5.557     5.822
mins          -0.0836      0.009     -8.811      0.000        -0.102    -0.065
goals          0.1689      0.011     15.172      0.000         0.147     0.191
assists        0.0665      0.008      8.568      0.000         0.051     0.082
yel           -0.0228      0.006     -3.603      0.000        -0.035    -0.010
spg            0.0836      0.009      9.704      0.000         0.067     0.100
ps_x           0.0156      0.003      4.572      0.000         0.009     0.022
motm           0.0235      0.009      2.711      0.007         0.006     0.041
aw             0.0802      0.007     11.491      0.000         0.066     0.094
tackles        0.0691      0.007      9.406      0.000         0.055     0.084
inter          0.0469      0.007      7.214      0.000         0.034     0.060
fouls         -0.0153      0.007     -2.338      0.020        -0.028    -0.002
drb           -0.0165      0.006     -2.563      0.011        -0.029    -0.004
keyp_x         0.1012      0.008     12.456      0.000         0.085     0.117
fouled         0.0385      0.006      6.022      0.000         0.026     0.051
avgp           0.0347      0.009      3.755      0.000         0.017     0.053
ps_y           0.0156      0.003      4.572      0.000         0.009     0.022
==============================================================================
Omnibus:                       45.982   Durbin-Watson:                   1.973
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              104.109
Skew:                           0.421   Prob(JB):                     2.47e-23
Kurtosis:                       4.839   Cond. No.                     4.26e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 7.4e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.



In [39]:

    
formula_str = """
rating ~ mins + goals + assists + yel + spg + ps_x
+ motm + aw + tackles + inter + fouls + drb + keyp_x
+ fouled + avgp + ps_y
"""

model = sm.OLS.from_formula(formula_str, data=removed_f_df)
result = model.fit()
table_anova = sm.stats.anova_lm(result)
table_anova









    Out[39]:






  
    
      
      df
      sum_sq
      mean_sq
      F
      PR(>F)
    
  
  
    
      mins
      1.0
      44.227823
      44.227823
      3199.652899
      1.467266e-241
    
    
      goals
      1.0
      14.209164
      14.209164
      1027.959101
      9.237160e-132
    
    
      assists
      1.0
      7.687434
      7.687434
      556.145853
      2.536368e-87
    
    
      yel
      1.0
      0.116096
      0.116096
      8.398942
      3.892666e-03
    
    
      spg
      1.0
      5.702216
      5.702216
      412.525675
      4.544223e-70
    
    
      ps_x
      1.0
      0.716860
      0.716860
      51.861086
      1.807262e-12
    
    
      motm
      1.0
      2.835558
      2.835558
      205.137890
      3.429509e-40
    
    
      aw
      1.0
      2.351298
      2.351298
      170.104197
      2.240872e-34
    
    
      tackles
      1.0
      5.572391
      5.572391
      403.133490
      7.421342e-69
    
    
      inter
      1.0
      1.060663
      1.060663
      76.733450
      2.031578e-17
    
    
      fouls
      1.0
      0.004059
      0.004059
      0.293674
      5.880784e-01
    
    
      drb
      1.0
      0.007110
      0.007110
      0.514381
      4.735300e-01
    
    
      keyp_x
      1.0
      3.789535
      3.789535
      274.153114
      6.326110e-51
    
    
      fouled
      1.0
      0.551940
      0.551940
      39.930013
      5.162150e-10
    
    
      avgp
      1.0
      0.194879
      0.194879
      14.098477
      1.905189e-04
    
    
      ps_y
      1.0
      0.008867
      0.008867
      0.641490
      4.234913e-01
    
    
      Residual
      595.0
      8.224503
      0.013823
      NaN
      NaN



In [40]:

    
# remove feature 2
remove_column_list = [
    "fouls", "drb", "ps_y"
]
removed2_f_df = removed_f_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed2_f_df.ix[:, -1], removed2_f_df.ix[:, :-1])
result = model.fit()
print(result.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.914
Model:                            OLS   Adj. R-squared:                  0.912
Method:                 Least Squares   F-statistic:                     486.5
Date:                Sun, 26 Jun 2016   Prob (F-statistic):          1.93e-307
Time:                        17:33:54   Log-Likelihood:                 443.11
No. Observations:                 611   AIC:                            -858.2
Df Residuals:                     597   BIC:                            -796.4
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6694      0.068     83.946      0.000         5.537     5.802
mins          -0.0838      0.010     -8.803      0.000        -0.102    -0.065
goals          0.1726      0.011     15.448      0.000         0.151     0.195
assists        0.0662      0.008      8.461      0.000         0.051     0.082
yel           -0.0280      0.006     -4.658      0.000        -0.040    -0.016
spg            0.0801      0.008      9.458      0.000         0.063     0.097
ps_x           0.0319      0.007      4.640      0.000         0.018     0.045
motm           0.0248      0.009      2.841      0.005         0.008     0.042
aw             0.0769      0.007     11.273      0.000         0.063     0.090
tackles        0.0593      0.007      8.766      0.000         0.046     0.073
inter          0.0452      0.007      6.918      0.000         0.032     0.058
keyp_x         0.1016      0.008     12.432      0.000         0.086     0.118
fouled         0.0331      0.006      5.391      0.000         0.021     0.045
avgp           0.0329      0.009      3.613      0.000         0.015     0.051
==============================================================================
Omnibus:                       47.274   Durbin-Watson:                   1.982
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              120.209
Skew:                           0.394   Prob(JB):                     7.89e-27
Kurtosis:                       5.025   Cond. No.                         155.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [1]:

    
# forword
# goals
# keyp_x : 키 패스
# spg : 게임당 슈팅수
# assists
# aw : 공중볼 경합승리
# fouled : 파울당한 횟수

	const	age	tall	weight	apps_start	apps_sub	mins	goals	assists	yel	...	blocks	keyp_x	fouled	off	disp	unstch	avgp	ps_y	rating
0	1	8.382932	9.541107	6.649772	2.985339	0.820926	3.276870	3.344336	2.398933	3.995343	...	1.313059	2.015096	3.009618	3.162057	3.701969	3.353454	2.286773	8.088244	7.37
1	1	6.945858	9.699247	7.181753	2.686805	0.615694	2.803518	1.170518	2.398933	4.439270	...	1.313059	1.612077	3.762023	1.149839	2.005233	2.624442	3.353934	8.501999	7.25
2	1	6.945858	9.699247	7.093090	0.298534	1.231389	0.341144	0.000000	0.000000	0.000000	...	2.626118	0.403019	0.451443	0.000000	1.388238	0.874814	1.088940	9.783304	6.22
3	1	5.987809	9.224827	6.383781	3.582407	0.410463	3.609751	3.678769	1.999111	2.219635	...	1.313059	2.821134	1.504809	1.149839	2.005233	2.624442	3.887515	10.450651	7.50
4	1	7.664395	9.751960	7.004426	1.691692	2.668009	1.997285	1.839385	1.599289	1.331781	...	1.313059	1.410567	1.805771	1.437298	2.159482	2.332838	1.295838	8.795631	6.98

	df	sum_sq	mean_sq	F	PR(>F)
mins	1.0	44.227823	44.227823	3199.652899	1.467266e-241
goals	1.0	14.209164	14.209164	1027.959101	9.237160e-132
assists	1.0	7.687434	7.687434	556.145853	2.536368e-87
yel	1.0	0.116096	0.116096	8.398942	3.892666e-03
spg	1.0	5.702216	5.702216	412.525675	4.544223e-70
ps_x	1.0	0.716860	0.716860	51.861086	1.807262e-12
motm	1.0	2.835558	2.835558	205.137890	3.429509e-40
aw	1.0	2.351298	2.351298	170.104197	2.240872e-34
tackles	1.0	5.572391	5.572391	403.133490	7.421342e-69
inter	1.0	1.060663	1.060663	76.733450	2.031578e-17
fouls	1.0	0.004059	0.004059	0.293674	5.880784e-01
drb	1.0	0.007110	0.007110	0.514381	4.735300e-01
keyp_x	1.0	3.789535	3.789535	274.153114	6.326110e-51
fouled	1.0	0.551940	0.551940	39.930013	5.162150e-10
avgp	1.0	0.194879	0.194879	14.098477	1.905189e-04
ps_y	1.0	0.008867	0.008867	0.641490	4.234913e-01
Residual	595.0	8.224503	0.013823	NaN	NaN