Midfilder OLS

  • Import Package
  • Connect DB & get Forword Player Data
  • Scaling
  • Summary OLS
  • Remove Feature
  • Anova & Remove Feature

Import Package


In [1]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

Connect DB & get Forword Player Data


In [11]:
db = MySQLdb.connect(
    "db.fastcamp.us",
    "root",
    "dkstncks",
    "football",
    charset='utf8',
)

def make_query(position):
    """
    
    parameter------------
    position : M, D, F, G
    
    return---------------
    SQL_QUERY String
    
    """
    SQL_QUERY = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%M%" and mins > 270
        ;
    """.format(position=position)
    
    return SQL_QUERY

# midfilder
SQL_QUERY = make_query("M")
midfilder_df = pd.read_sql(SQL_QUERY, db)

len(midfilder_df)


Out[11]:
1582
Scaling

In [12]:
X = midfilder_df.ix[:,:-1]
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(midfilder_df.ix[:,-1], columns=["rating"])
m_df = pd.concat([dfX, dfy], axis=1)
m_df.head()


Out[12]:
const age tall weight apps_start apps_sub mins goals assists yel ... blocks owng keyp_x fouled off disp unstch avgp ps_y rating
0 1 7.318647 14.800800 10.561215 2.780837 0.704054 2.879113 1.876429 2.456496 3.478088 ... 0.639768 0.0 1.436757 4.101984 1.808584 2.168207 3.033856 2.165255 9.363235 7.25
1 1 6.813913 14.961678 10.039673 3.089819 0.938738 3.262186 0.536123 1.228248 2.086853 ... 2.559071 0.0 0.718378 0.984476 0.000000 1.000711 1.179833 3.107282 11.568079 7.05
2 1 7.318647 14.076848 9.387747 3.295807 0.938738 3.515548 0.268061 2.047080 2.782470 ... 0.639768 0.0 3.232703 2.461190 0.000000 1.834636 1.011285 3.704836 11.950252 7.03
3 1 6.813913 14.479043 9.909288 3.398801 0.234685 3.498576 0.268061 1.228248 2.086853 ... 0.639768 0.0 1.257162 0.984476 0.452146 1.167496 1.348380 2.404277 10.377463 7.02
4 1 7.066280 14.398604 8.735820 2.265867 1.408107 2.329960 0.804184 3.275329 1.043426 ... 0.000000 0.0 2.693919 1.804873 1.356438 2.501777 2.022571 1.638001 11.303498 6.96

5 rows × 31 columns

Summary OLS

In [13]:
model = sm.OLS(m_df.ix[:, -1], m_df.ix[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.860
Model:                            OLS   Adj. R-squared:                  0.857
Method:                 Least Squares   F-statistic:                     340.7
Date:                Sun, 26 Jun 2016   Prob (F-statistic):               0.00
Time:                        18:42:38   Log-Likelihood:                 1058.1
No. Observations:                1582   AIC:                            -2058.
Df Residuals:                    1553   BIC:                            -1903.
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6114      0.075     74.415      0.000         5.463     5.759
age           -0.0140      0.003     -4.081      0.000        -0.021    -0.007
tall           0.0050      0.005      0.981      0.327        -0.005     0.015
weight        -0.0039      0.005     -0.727      0.467        -0.014     0.007
apps_start     0.0264      0.034      0.787      0.431        -0.039     0.092
apps_sub       0.0121      0.006      2.129      0.033         0.001     0.023
mins          -0.0553      0.033     -1.667      0.096        -0.120     0.010
goals          0.0947      0.006     15.582      0.000         0.083     0.107
assists        0.0506      0.005      9.745      0.000         0.040     0.061
yel           -0.0126      0.005     -2.708      0.007        -0.022    -0.003
red           -0.0150      0.003     -4.572      0.000        -0.021    -0.009
spg            0.0538      0.006      9.437      0.000         0.043     0.065
ps_x           0.0184      0.002      7.935      0.000         0.014     0.023
motm           0.0373      0.005      7.454      0.000         0.027     0.047
aw             0.0504      0.004     11.764      0.000         0.042     0.059
tackles        0.0983      0.006     17.514      0.000         0.087     0.109
inter          0.0818      0.006     13.377      0.000         0.070     0.094
fouls         -0.0251      0.005     -5.433      0.000        -0.034    -0.016
offsides      -0.0015      0.004     -0.333      0.739        -0.010     0.007
clear          0.0363      0.006      5.877      0.000         0.024     0.048
drb           -0.0168      0.004     -3.780      0.000        -0.025    -0.008
blocks         0.0047      0.005      1.025      0.306        -0.004     0.014
owng          -0.0063      0.003     -1.937      0.053        -0.013  7.98e-05
keyp_x         0.0914      0.005     16.668      0.000         0.081     0.102
fouled         0.0378      0.004      9.411      0.000         0.030     0.046
off            0.0017      0.004      0.380      0.704        -0.007     0.010
disp           0.0031      0.005      0.579      0.563        -0.007     0.013
unstch         0.0072      0.006      1.291      0.197        -0.004     0.018
avgp           0.0509      0.006      7.876      0.000         0.038     0.064
ps_y           0.0184      0.002      7.935      0.000         0.014     0.023
==============================================================================
Omnibus:                       78.970   Durbin-Watson:                   1.579
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              153.086
Skew:                           0.350   Prob(JB):                     5.73e-34
Kurtosis:                       4.353   Cond. No.                     9.03e+15
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.27e-26. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
Remove Some Feature

In [15]:
# remove features
remove_column_list = [
    "tall", "weight", "apps_start", "apps_sub", "mins", "offsides", "blocks", "owng", "off", "disp", "unstch"
]
removed_m_df = m_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed_m_df.ix[:, -1], removed_m_df.ix[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.857
Model:                            OLS   Adj. R-squared:                  0.855
Method:                 Least Squares   F-statistic:                     549.8
Date:                Sun, 26 Jun 2016   Prob (F-statistic):               0.00
Time:                        18:43:38   Log-Likelihood:                 1039.5
No. Observations:                1582   AIC:                            -2043.
Df Residuals:                    1564   BIC:                            -1946.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6499      0.058     96.830      0.000         5.535     5.764
age           -0.0147      0.003     -4.448      0.000        -0.021    -0.008
goals          0.0848      0.006     15.251      0.000         0.074     0.096
assists        0.0456      0.005      9.051      0.000         0.036     0.055
yel           -0.0218      0.004     -5.315      0.000        -0.030    -0.014
red           -0.0153      0.003     -4.637      0.000        -0.022    -0.009
spg            0.0553      0.006      9.962      0.000         0.044     0.066
ps_x           0.0202      0.002      8.829      0.000         0.016     0.025
motm           0.0378      0.005      7.588      0.000         0.028     0.048
aw             0.0502      0.004     11.930      0.000         0.042     0.058
tackles        0.0946      0.006     16.981      0.000         0.084     0.106
inter          0.0762      0.006     13.083      0.000         0.065     0.088
fouls         -0.0203      0.004     -4.684      0.000        -0.029    -0.012
clear          0.0295      0.005      6.195      0.000         0.020     0.039
drb           -0.0167      0.004     -3.810      0.000        -0.025    -0.008
keyp_x         0.0901      0.005     16.726      0.000         0.080     0.101
fouled         0.0372      0.004      9.913      0.000         0.030     0.045
avgp           0.0386      0.006      6.446      0.000         0.027     0.050
ps_y           0.0202      0.002      8.829      0.000         0.016     0.025
==============================================================================
Omnibus:                       87.705   Durbin-Watson:                   1.554
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              158.476
Skew:                           0.409   Prob(JB):                     3.87e-35
Kurtosis:                       4.318   Cond. No.                     7.10e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.07e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
Anova & Remove Feature

In [16]:
formula_str = """
rating ~ age + goals + assists + yel + red
+ spg + ps_x + motm + aw + tackles + inter + fouls + clear + drb
+ keyp_x + fouled + avgp + ps_y
"""

model = sm.OLS.from_formula(formula_str, data=removed_m_df)
result = model.fit()
table_anova = sm.stats.anova_lm(result)
table_anova


Out[16]:
df sum_sq mean_sq F PR(>F)
age 1.0 0.011797 0.011797 0.741286 3.893811e-01
goals 1.0 46.463072 46.463072 2919.695676 0.000000e+00
assists 1.0 22.784540 22.784540 1431.759020 5.364498e-223
yel 1.0 9.644631 9.644631 606.059511 2.255951e-113
red 1.0 0.105741 0.105741 6.644657 1.003618e-02
spg 1.0 0.848527 0.848527 53.320646 4.492423e-13
ps_x 1.0 8.528253 8.528253 535.907372 3.412961e-102
motm 1.0 12.238690 12.238690 769.067724 5.237508e-138
aw 1.0 12.746860 12.746860 801.000648 1.248535e-142
tackles 1.0 18.915697 18.915697 1188.644595 3.127980e-194
inter 1.0 5.812283 5.812283 365.238389 2.434136e-73
fouls 1.0 0.194598 0.194598 12.228348 4.838498e-04
clear 1.0 0.353440 0.353440 22.209836 2.661141e-06
drb 1.0 0.000233 0.000233 0.014628 9.037490e-01
keyp_x 1.0 7.742064 7.742064 486.503997 4.309083e-94
fouled 1.0 1.680731 1.680731 105.615527 5.096085e-24
avgp 1.0 0.661278 0.661278 41.554095 1.523403e-10
ps_y 1.0 0.000318 0.000318 0.019995 8.875691e-01
Residual 1564.0 24.888979 0.015914 NaN NaN

In [17]:
# remove feature 2
remove_column_list = [
    "age", "red", "drb", "ps_y"
]
removed2_m_df = removed_m_df.drop(remove_column_list, axis=1) 

model = sm.OLS(removed2_m_df.ix[:, -1], removed2_m_df.ix[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 rating   R-squared:                       0.852
Model:                            OLS   Adj. R-squared:                  0.850
Method:                 Least Squares   F-statistic:                     642.1
Date:                Sun, 26 Jun 2016   Prob (F-statistic):               0.00
Time:                        18:44:43   Log-Likelihood:                 1011.9
No. Observations:                1582   AIC:                            -1994.
Df Residuals:                    1567   BIC:                            -1913.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.5220      0.053    105.055      0.000         5.419     5.625
goals          0.0826      0.006     14.640      0.000         0.072     0.094
assists        0.0479      0.005      9.381      0.000         0.038     0.058
yel           -0.0264      0.004     -6.434      0.000        -0.035    -0.018
spg            0.0591      0.006     10.564      0.000         0.048     0.070
ps_x           0.0431      0.005      9.366      0.000         0.034     0.052
motm           0.0384      0.005      7.602      0.000         0.028     0.048
aw             0.0501      0.004     11.738      0.000         0.042     0.058
tackles        0.0871      0.005     17.229      0.000         0.077     0.097
inter          0.0760      0.006     12.849      0.000         0.064     0.088
fouls         -0.0230      0.004     -5.286      0.000        -0.032    -0.014
clear          0.0314      0.005      6.564      0.000         0.022     0.041
keyp_x         0.0867      0.005     15.905      0.000         0.076     0.097
fouled         0.0380      0.004      9.959      0.000         0.030     0.045
avgp           0.0315      0.006      5.303      0.000         0.020     0.043
==============================================================================
Omnibus:                       91.497   Durbin-Watson:                   1.581
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              175.863
Skew:                           0.404   Prob(JB):                     6.48e-39
Kurtosis:                       4.419   Cond. No.                         209.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [ ]:
# midfilder
# tackles
# keyp_x
# goals
# inter
# spg