In [1]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import sqlite3
import statsmodels.api as sm
%matplotlib inline

Read the combine and production data from SQLite using SQL queries


In [2]:
# Load combine measurements and rushing/receiving production from SQLite
# and join them into a single running-back-only frame (df5).
con = sqlite3.connect('../data/nflPPdb.sqlite')

# Per-season production (rr) for players who also appear in the combine
# table, restricted to pre-2009 combine classes.
df1 = pd.read_sql_query('SELECT DISTINCT combine.name, rr.year, rr.rushing_yards, rr.receiving_yards, rr.games_played\
                        FROM combine, rr\
                        WHERE combine.name = rr.name AND combine.year < 2009', con)

# Combine measurables for drafted players only (pickround != 0), pre-2009.
df2 = pd.read_sql_query('SELECT combine.name, combine.fortyyd, \
                        combine.twentyss, combine.vertical, combine.picktotal, \
                        combine.heightinchestotal, combine.weight \
                        FROM combine\
                        WHERE combine.year < 2009 AND combine.pickround != 0', con)

# Speed score: 200 * weight / (40-yard time)^4.
df2['speedscore'] = (df2['weight']*200)/(df2['fortyyd']**4)
# BMI from weight (lbs) and height (inches): 703 * w / h^2.
df2['BMI'] = 703.0 * df2['weight'] / df2['heightinchestotal']**2

df3 = pd.merge(df1, df2, on='name', how='inner', suffixes=('df1','df2'))
df4 = pd.read_sql_query('SELECT players.name, players.position\
                        FROM players', con)
df5 = pd.merge(df3,df4, on='name', how='inner', suffixes=('df3','df4'))
df5 = df5.drop_duplicates()
# Total yards from scrimmage per season.
df5['totYds'] = (df5.receiving_yards + df5.rushing_yards)
# choose position of a player
df5 = df5[df5.position.isin(['RB'])]
#df5 = df5[df5.fortyyd < 5] #remove outlier

# All queries are done — release the database handle (it was previously
# left open for the lifetime of the kernel).
con.close()

Multivariable regression

Performance: regress total yards (rushing + receiving, summed over the first three seasons) on combine measurables


In [3]:
# Keep each player's first three recorded seasons.
regdata = df5.groupby('name').head(3).reset_index(drop=True)
# Take an independent copy so we can SUM yardage while AVERAGING the
# combine measurables. (The original code merely aliased the frame, which
# only worked because `regdata` happens to be rebound on the next line.)
regdata_d = regdata.copy()
regdata = regdata.groupby('name').mean()
regdata_d = regdata_d.groupby('name').sum()
# Response variable: total yards over the first three seasons.
regdata['totYds'] = regdata_d['totYds']
regdata = regdata.dropna()

In [4]:
# Design matrix: raw combine measurables plus an explicit intercept term.
predictors = ['weight', 'heightinchestotal', 'fortyyd', 'twentyss', 'vertical']
X = sm.add_constant(regdata[predictors])
Y = regdata['totYds']
mod = sm.OLS(Y, X)

In [5]:
# Fit the totYds model by ordinary least squares.
res = mod.fit()

In [6]:
# Parenthesized print is valid under both Python 2 and Python 3.
print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 totYds   R-squared:                       0.232
Model:                            OLS   Adj. R-squared:                  0.197
Method:                 Least Squares   F-statistic:                     6.584
Date:                Wed, 10 Aug 2016   Prob (F-statistic):           2.17e-05
Time:                        16:49:08   Log-Likelihood:                -983.24
No. Observations:                 115   AIC:                             1978.
Df Residuals:                     109   BIC:                             1995.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
=====================================================================================
                        coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------
const              2.911e+04   6751.826      4.311      0.000      1.57e+04  4.25e+04
weight               23.6337      9.617      2.457      0.016         4.572    42.695
heightinchestotal   -28.0337     76.785     -0.365      0.716      -180.220   124.152
fortyyd           -6691.3688   1253.491     -5.338      0.000     -9175.747 -4206.990
twentyss            -59.2034     62.525     -0.947      0.346      -183.127    64.720
vertical             -5.6216     12.268     -0.458      0.648       -29.937    18.694
==============================================================================
Omnibus:                        9.891   Durbin-Watson:                   1.962
Prob(Omnibus):                  0.007   Jarque-Bera (JB):                9.954
Skew:                           0.698   Prob(JB):                      0.00690
Kurtosis:                       3.360   Cond. No.                     1.31e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.31e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

In [7]:
# Scatter each predictor against the response. The intercept column is
# skipped — a constant 1.0 produces a meaningless plot. The y-axis label
# previously said 'rushing_yards', but Y is the summed rushing +
# receiving yards (totYds).
for column in X.columns:
    if column == 'const':
        continue
    plt.scatter(X[column], Y)
    plt.xlabel(column)
    plt.ylabel('totYds')
    plt.show(block=False)


Draft position: regress total pick number on the same combine measurables


In [8]:
# Same design matrix X (measurables + intercept), new response: overall
# draft pick number.
Y2 = regdata['picktotal']
mod3 = sm.OLS(Y2, X)
res3 = mod3.fit()
# Parenthesized print is valid under both Python 2 and Python 3.
print(res3.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:              picktotal   R-squared:                       0.287
Model:                            OLS   Adj. R-squared:                  0.254
Method:                 Least Squares   F-statistic:                     8.766
Date:                Wed, 10 Aug 2016   Prob (F-statistic):           5.14e-07
Time:                        16:49:09   Log-Likelihood:                -628.67
No. Observations:                 115   AIC:                             1269.
Df Residuals:                     109   BIC:                             1286.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
=====================================================================================
                        coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------
const             -1251.6341    309.307     -4.047      0.000     -1864.670  -638.598
weight               -1.3976      0.441     -3.172      0.002        -2.271    -0.524
heightinchestotal    -0.0110      3.518     -0.003      0.998        -6.983     6.961
fortyyd             361.8996     57.424      6.302      0.000       248.088   475.711
twentyss              1.2189      2.864      0.426      0.671        -4.458     6.896
vertical              0.4303      0.562      0.766      0.446        -0.684     1.544
==============================================================================
Omnibus:                        9.237   Durbin-Watson:                   2.356
Prob(Omnibus):                  0.010   Jarque-Bera (JB):                9.717
Skew:                           0.712   Prob(JB):                      0.00776
Kurtosis:                       3.017   Cond. No.                     1.31e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.31e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

In [9]:
# Scatter each predictor against draft position. Skip the intercept
# column — a constant 1.0 produces a meaningless plot.
for column in X.columns:
    if column == 'const':
        continue
    plt.scatter(X[column], Y2)
    plt.xlabel(column)
    plt.ylabel('Total_pick')
    plt.show(block=False)



In [10]:
# Second model: rushing yards explained by speed score and BMI alone.
# NOTE: this rebinds X and Y, so the earlier plotting cells can no longer
# be re-run after this point without re-running the cells above.
features = regdata[['speedscore', 'BMI']]
X = sm.add_constant(features)
Y = regdata['rushing_yards']
mod2 = sm.OLS(Y, X)

In [11]:
# Fit the speedscore/BMI model by ordinary least squares.
res2 = mod2.fit()

In [12]:
# 100x100 grid spanning the observed speedscore/BMI ranges, used below to
# evaluate and draw the fitted regression plane.
speed_axis = np.linspace(regdata['speedscore'].min(), regdata['speedscore'].max(), 100)
bmi_axis = np.linspace(regdata['BMI'].min(), regdata['BMI'].max(), 100)
xx1, xx2 = np.meshgrid(speed_axis, bmi_axis)

In [13]:
# Evaluate the fitted plane over the grid. Index coefficients by name
# ('const' is the column added by sm.add_constant) instead of by position,
# which is robust to column reordering and to pandas deprecating positional
# [] indexing on Series.
Z = res2.params['const'] + res2.params['speedscore'] * xx1 + res2.params['BMI'] * xx2

In [14]:
# 3-D view of the fitted regression plane with the observations overlaid:
# red markers have non-negative residuals (sit on/above the plane), black
# markers have negative residuals (sit below it).
fig = plt.figure(figsize=(12, 8))
ax = Axes3D(fig, azim=-115, elev=15)
surf = ax.plot_surface(xx1, xx2, Z, cmap=plt.cm.RdBu_r, alpha=0.6, linewidth=0)
residuals = Y - res2.predict(X)
over = residuals >= 0
under = residuals < 0
ax.scatter(X.loc[over, 'speedscore'], X.loc[over, 'BMI'], Y[over], c='r', marker='o')
ax.scatter(X.loc[under, 'speedscore'], X.loc[under, 'BMI'], Y[under], color='black', alpha=1.0)
ax.set_xlabel('speedscore')
ax.set_ylabel('BMI')
ax.set_zlabel('RushingYards')


Out[14]:
<matplotlib.text.Text at 0x11dba3590>