A Different Moneyball

  • By: Matthew Cohen
  • (Based on the concept of the movie Moneyball)
  • Hypothesis: Can wins in the NBA be predicted by statistics other than those shown on the scoreboard? In other words, disregarding points scored and points allowed, can a team's success over a season be accurately predicted?
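
The response variable, good_team, comes from a Good_Teams2 table that gets joined in below. Its construction is not shown in this notebook; a minimal sketch, assuming the label is just a wins threshold (the 50-win cutoff here is hypothetical), might look like:

In [ ]:
import sqlite3

conn = sqlite3.connect('/Users/MatthewCohen/Documents/SQLite/TeamSeason1.sqlite')
# Hypothetical labeling rule: a "good team" season is one with 50+ wins
conn.execute("""CREATE TABLE IF NOT EXISTS Good_Teams2 AS
                SELECT team, year,
                       CASE WHEN won >= 50 THEN 1 ELSE 0 END AS good_team
                FROM TeamSeason1;""")
conn.commit()
conn.close()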

Bringing in Data and Necessary Packages


In [2]:
import sqlite3
import pandas
import numpy
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.feature_selection import RFECV
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
import matplotlib.pyplot as plt

%matplotlib inline

conn = sqlite3.connect('/Users/MatthewCohen/Documents/SQLite/TeamSeason1.sqlite')
query = """SELECT t.won as wins, g.good_team, t.o_fgm as field_goals_made, t.o_fga as field_goals_attempted,
t.o_ftm as free_throws_made, t.o_fta as free_throws_attempted, t.o_oreb as offensive_rebounds,
t.o_dreb as defensive_rebounds, t.o_reb as total_rebounds, t.o_asts as assists, t.o_pf as personal_fouls,
t.o_stl as steals, t.o_to as turnovers, t.o_3pm as three_pointers_made, t.o_3pa as three_pointers_attempted,
t.d_fgm as field_goals_allowed, t.d_fga as field_goal_attempts_allowed, t.d_reb as rebounds_allowed,
t.d_asts as assists_allowed, t.d_pf as fouls_against, t.d_3pm as three_point_makes_allowed,
(o_fgm * 100.0 / o_fga) as field_goal_percentage, (o_ftm * 100.0 / o_fta) as free_throw_percentage,
(o_3pm * 100.0 / o_3pa) as three_point_percentage, o_blk as blocks, o_pts as points, d_pts as points_against
FROM TeamSeason1 t
LEFT OUTER JOIN Good_Teams2 g ON t.team = g.team and t.year = g.year
WHERE t.year > 1980 and t.year <= 2009;"""
df = pandas.read_sql(query, conn)
conn.close()



Defining Explanatory Features and Response Series


In [3]:
explanatory_features = [col for col in df.columns if col not in ['field_goals_made', 'field_goals_allowed', 'good_team', 'wins', 'points', 'points_against', 'free_throws_made', 'three_pointers_made']]
explanatory_df = df[explanatory_features]
explanatory_colnames = explanatory_df.columns

response_series = df.good_team
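
Since ROC AUC and raw accuracy read differently when classes are unbalanced, it can be worth a quick look at how many seasons fall in each class before modeling:

In [ ]:
# Counts of good (1) vs. not-good (0) seasons in the response
print response_series.value_counts()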

Checking for Correlation Between Explanatory Variables


In [4]:
toChart = explanatory_df.corr()
plt.pcolor(toChart)
plt.yticks(numpy.arange(0.5, len(toChart.index), 1), toChart.index)
plt.xticks(numpy.arange(0.5, len(toChart.columns), 1), toChart.columns, rotation=-90)
plt.colorbar()
plt.show()


Removing Any Explanatory Features That Are Highly Correlated


In [5]:
def find_perfect_corr(df):
    """Finds columns that are highly positively or negatively correlated
    (absolute correlation >= 0.9) and returns a dict that includes which
    columns to drop so that each remaining column is (near-)independent.
    """
    corrMatrix = df.corr()
    # Keep only the lower triangle so each correlated pair is counted once
    corrMatrix.loc[:,:] = numpy.tril(corrMatrix.values, k = -1)
    already_in = set()
    result = []
    for col in corrMatrix:
        perfect_corr = corrMatrix[col][abs(numpy.round(corrMatrix[col],10)) >= 0.9].index.tolist()
        if perfect_corr and col not in already_in:
            already_in.update(set(perfect_corr))
            perfect_corr.append(col)
            result.append(perfect_corr)
    toRemove = []
    for item in result:
        # Keep the first column in each correlated group; drop the rest
        toRemove.append(item[1:])
    toRemove = sum(toRemove, [])
    return {'corrGroupings':result, 'toRemove':toRemove}

no_correlation = find_perfect_corr(explanatory_df)
# Reassign rather than dropping in place to avoid the SettingWithCopyWarning
explanatory_df = explanatory_df.drop(no_correlation['toRemove'], axis = 1)
print no_correlation


{'corrGroupings': [['field_goal_attempts_allowed', 'field_goals_attempted'], ['fouls_against', 'free_throws_attempted']], 'toRemove': ['field_goals_attempted', 'free_throws_attempted']}

Scaling Data


In [6]:
scaler = preprocessing.StandardScaler()
scaler.fit(explanatory_df)
explanatory_df = pandas.DataFrame(scaler.transform(explanatory_df), columns = explanatory_df.columns)
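
StandardScaler learns its means and variances from whatever is passed to fit, so when holdout data is scored later it should be transformed with the scaler fit on the training data rather than refit. A small illustrative helper (scale_holdout and holdout_df are hypothetical names, not from the original analysis):

In [ ]:
def scale_holdout(holdout_df, fitted_scaler):
    # Reuse the already-fit scaler; calling fit again would leak holdout statistics
    return pandas.DataFrame(fitted_scaler.transform(holdout_df),
                            columns = holdout_df.columns)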

Scanning for the Best Number of Features to Use


In [7]:
from sklearn.feature_selection import RFE

class RandomForestsWithCoef(ensemble.RandomForestClassifier):
    """Random forest that exposes feature_importances_ as coef_ so RFE/RFECV can rank features."""
    def fit(self, *args, **kwargs):
        super(RandomForestsWithCoef, self).fit(*args, **kwargs)
        self.coef_ = self.feature_importances_
        return self

rfWithCoef = RandomForestsWithCoef(n_estimators = 500)
rfe_cv = RFECV(estimator=rfWithCoef, step=1, cv=10, scoring='roc_auc', verbose = 0)
rfe_cv.fit(explanatory_df, response_series)


Out[7]:
RFECV(cv=10,
   estimator=RandomForestsWithCoef(bootstrap=True, compute_importances=None,
           criterion='gini', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
           min_samples_split=2, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=None, verbose=0),
   estimator_params={}, loss_func=None, scoring='roc_auc', step=1,
   verbose=0)
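
The fitted RFECV object records how many features survived the scan and which ones. A quick way to inspect it, using the standard RFECV attributes:

In [ ]:
# Number of features kept by the cross-validated scan, and their names
print rfe_cv.n_features_
print explanatory_df.columns[rfe_cv.support_].tolist()

# Mean CV score at each feature-count step of the elimination
plt.figure()
plt.plot(range(1, len(rfe_cv.grid_scores_) + 1), rfe_cv.grid_scores_)
plt.xlabel('Number of features selected')
plt.ylabel('Cross-validated ROC AUC')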

Defining New Feature Set


In [8]:
# offensive_rebounds is additionally excluded here, following the feature scan above
explanatory_features = [col for col in df.columns if col not in ['field_goals_made', 'field_goals_allowed', 'good_team', 'wins', 'points', 'points_against', 'free_throws_made', 'three_pointers_made', 'offensive_rebounds']]
explanatory_df = df[explanatory_features]
explanatory_colnames = explanatory_df.columns

Predicting Wins Using Random Forests


In [9]:
rf = ensemble.RandomForestClassifier(n_estimators= 500)
roc_scores_rf = cross_val_score(rf, explanatory_df, response_series, cv=10, scoring='roc_auc', n_jobs = -1)
print roc_scores_rf.mean()


0.903573327258

Grid Search for Best Parameters


In [21]:
trees_range = range(10, 300, 10)
param_grid = dict(n_estimators = trees_range)
grid = GridSearchCV(rf, param_grid, cv=10, scoring='roc_auc', n_jobs = -1)
grid.fit(explanatory_df, response_series)
grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
plt.figure()
plt.plot(trees_range, grid_mean_scores)


Out[21]:
[<matplotlib.lines.Line2D at 0x10978c150>]

Displaying Best Score


In [22]:
best_rf_est = grid.best_estimator_
print best_rf_est.n_estimators
print grid.best_score_


160
0.855148875761

Plotting Sensitivity vs Specificity


In [23]:
from sklearn.cross_validation import train_test_split
from sklearn import metrics

xTrain, xTest, yTrain, yTest = train_test_split(
                    explanatory_df, response_series, test_size =  0.3)
rf_probabilities = pandas.DataFrame(best_rf_est.fit(xTrain, yTrain).predict_proba(xTest))
rf_fpr, rf_tpr, thresholds = metrics.roc_curve(yTest, rf_probabilities[1])

plt.figure()
plt.plot(rf_fpr, rf_tpr, color = 'b')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')


Out[23]:
<matplotlib.text.Text at 0x10dbad8d0>
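
To reduce the curve to a single number, the area under it can be computed from the same false positive and true positive rates:

In [ ]:
# Area under the ROC curve for this train/test split
print metrics.auc(rf_fpr, rf_tpr)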

Predict on the New Data


In [24]:
# Making two different groups - TRAINING and HOLDOUT
# TRAINING
conn = sqlite3.connect('/Users/MatthewCohen/Documents/SQLite/TeamSeason1.sqlite')
query = """SELECT t.won as wins, g.good_team, t.o_fgm as field_goals_made, t.o_fga as field_goals_attempted,
t.o_ftm as free_throws_made, t.o_fta as free_throws_attempted, t.o_oreb as offensive_rebounds,
t.o_dreb as defensive_rebounds, t.o_reb as total_rebounds, t.o_asts as assists, t.o_pf as personal_fouls,
t.o_stl as steals, t.o_to as turnovers, t.o_3pm as three_pointers_made, t.o_3pa as three_pointers_attempted,
t.d_fgm as field_goals_allowed, t.d_fga as field_goal_attempts_allowed, t.d_reb as rebounds_allowed,
t.d_asts as assists_allowed, t.d_pf as fouls_against, t.d_3pm as three_point_makes_allowed,
(o_fgm * 100.0 / o_fga) as field_goal_percentage, (o_ftm * 100.0 / o_fta) as free_throw_percentage,
(o_3pm * 100.0 / o_3pa) as three_point_percentage, o_blk as blocks, o_pts as points, d_pts as points_against
FROM TeamSeason1 t
LEFT OUTER JOIN Good_Teams2 g ON t.team = g.team and t.year = g.year
WHERE t.year > 1980 and t.year < 1999;"""
df = pandas.read_sql(query, conn)
conn.close()

# Defining Explanatory Features
explanatory_features = [col for col in df.columns if col not in ['field_goals_made', 'field_goals_allowed', 'good_team', 'wins', 'points', 'points_against', 'free_throws_made', 'three_pointers_made', 'offensive_rebounds']]
explanatory_df = df[explanatory_features]
explanatory_colnames = explanatory_df.columns

# Defining Response Series
response_series = df.good_team

# Standardizing features to zero mean and unit variance before modeling
scaler = preprocessing.StandardScaler()
scaler.fit(explanatory_df)
explanatory_df = pandas.DataFrame(scaler.transform(explanatory_df), columns = explanatory_df.columns)

# Predicting wins using Random Forests 
rf = ensemble.RandomForestClassifier(n_estimators= 500)
roc_scores_rf = cross_val_score(rf, explanatory_df, response_series, cv=10, scoring='roc_auc', n_jobs = -1)

# Grid Search for best parameters
trees_range = range(10, 300, 10)
param_grid = dict(n_estimators = trees_range)
grid = GridSearchCV(rf, param_grid, cv=10, scoring='roc_auc', n_jobs = -1)
grid.fit(explanatory_df, response_series)
grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
plt.figure()
plt.plot(trees_range, grid_mean_scores)


best_rf_est = grid.best_estimator_


# HOLDOUT
conn = sqlite3.connect('/Users/MatthewCohen/Documents/SQLite/TeamSeason1.sqlite')
query2 = """SELECT t.won as wins, g.good_team, t.o_fgm as field_goals_made, t.o_fga as field_goals_attempted,
t.o_ftm as free_throws_made, t.o_fta as free_throws_attempted, t.o_oreb as offensive_rebounds,
t.o_dreb as defensive_rebounds, t.o_reb as total_rebounds, t.o_asts as assists, t.o_pf as personal_fouls,
t.o_stl as steals, t.o_to as turnovers, t.o_3pm as three_pointers_made, t.o_3pa as three_pointers_attempted,
t.d_fgm as field_goals_allowed, t.d_fga as field_goal_attempts_allowed, t.d_reb as rebounds_allowed,
t.d_asts as assists_allowed, t.d_pf as fouls_against, t.d_3pm as three_point_makes_allowed,
(o_fgm * 100.0 / o_fga) as field_goal_percentage, (o_ftm * 100.0 / o_fta) as free_throw_percentage,
(o_3pm * 100.0 / o_3pa) as three_point_percentage, o_blk as blocks, o_pts as points, d_pts as points_against
FROM TeamSeason1 t
LEFT OUTER JOIN Good_Teams2 g ON t.team = g.team and t.year = g.year
WHERE t.year >= 1999 and t.year <= 2009;"""
df2 = pandas.read_sql(query2, conn)
conn.close()

# Defining Explanatory Features
explanatory_features2 = [col for col in df2.columns if col not in ['field_goals_made', 'field_goals_allowed', 'good_team', 'wins', 'points', 'points_against', 'free_throws_made', 'three_pointers_made', 'offensive_rebounds']]
explanatory_df2 = df2[explanatory_features2]
explanatory_colnames2 = explanatory_df2.columns

# Defining Response Series
response_series2 = df2.good_team

# Transform the holdout with the scaler fit on the TRAINING features above (no refitting)
explanatory_df2 = pandas.DataFrame(scaler.transform(explanatory_df2), columns = explanatory_df2.columns)


Creating Prediction Object


In [25]:
prediction = best_rf_est.predict(explanatory_df2)
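
Beyond hard 0/1 labels, the holdout years can also be scored on ROC AUC using predicted probabilities, which is directly comparable to the cross-validated scores above:

In [ ]:
from sklearn import metrics

# ROC AUC on the holdout years, using the class-1 probabilities
holdout_probs = best_rf_est.predict_proba(explanatory_df2)[:, 1]
print metrics.roc_auc_score(response_series2, holdout_probs)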

Visualize Results


In [26]:
import numpy as np

y = response_series2.tolist()
yhat = prediction.tolist()

def randomize(mean):
    """Scatter a small cloud of translucent points around (actual, predicted)
    so overlapping outcomes show up as density."""
    N = 5
    cov = [[0.02, 0], [0, 0.02]]  # independent jitter in each direction
    x, y = np.random.multivariate_normal(mean, cov, N).T
    plt.scatter(x, y, s=70, alpha=0.03)

for i in range(len(y)):
    randomize((y[i], yhat[i]))
plt.xlabel('Actual (good_team)')
plt.ylabel('Predicted')


Checking for Accuracy


In [27]:
diff = (response_series2 - prediction).tolist()
# Use float division; in Python 2, int / int truncates toward zero
accuracy = (float(diff.count(0)) / len(diff)) * 100
print accuracy
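
Accuracy alone can hide how the errors are distributed between the two classes; a confusion matrix breaks the holdout predictions down by outcome:

In [ ]:
from sklearn import metrics

# Rows are actual classes (0, 1); columns are predicted classes
print metrics.confusion_matrix(response_series2, prediction)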

List of Feature Importances


In [28]:
importances = pandas.DataFrame(grid.best_estimator_.feature_importances_, index = explanatory_df.columns, columns =['importance'])
importances.sort(columns = ['importance'], ascending = False, inplace = True)
print importances


                             importance
field_goal_percentage          0.122795
defensive_rebounds             0.108721
rebounds_allowed               0.091865
assists                        0.070202
turnovers                      0.061630
three_point_percentage         0.061299
total_rebounds                 0.051376
blocks                         0.050103
assists_allowed                0.048359
personal_fouls                 0.045823
steals                         0.045672
free_throw_percentage          0.040892
free_throws_attempted          0.038098
three_pointers_attempted       0.036026
field_goals_attempted          0.033915
three_point_makes_allowed      0.033079
field_goal_attempts_allowed    0.030997
fouls_against                  0.029148

Recursive Feature Elimination


In [29]:
rfe = RFE(estimator=rfWithCoef, n_features_to_select=3, step=1, verbose = 0)
rfe.fit(explanatory_df, response_series)

features_used = explanatory_df.columns[rfe.get_support()]
print features_used


Index([u'defensive_rebounds', u'rebounds_allowed', u'field_goal_percentage'], dtype='object')

Run Random Forests on a 3-Feature Subset


In [19]:
conn = sqlite3.connect('/Users/MatthewCohen/Documents/SQLite/TeamSeason1.sqlite')
query = """SELECT t.won as wins, g.good_team, t.o_fgm as field_goals_made, t.o_fga as field_goals_attempted,
t.o_ftm as free_throws_made, t.o_fta as free_throws_attempted, t.o_oreb as offensive_rebounds,
t.o_dreb as defensive_rebounds, t.o_reb as total_rebounds, t.o_asts as assists, t.o_pf as personal_fouls,
t.o_stl as steals, t.o_to as turnovers, t.o_3pm as three_pointers_made, t.o_3pa as three_pointers_attempted,
t.d_fgm as field_goals_allowed, t.d_fga as field_goal_attempts_allowed, t.d_reb as rebounds_allowed,
t.d_asts as assists_allowed, t.d_pf as fouls_against, t.d_3pm as three_point_makes_allowed,
(o_fgm * 100.0 / o_fga) as field_goal_percentage, (o_ftm * 100.0 / o_fta) as free_throw_percentage,
(o_3pm * 100.0 / o_3pa) as three_point_percentage, o_blk as blocks, o_pts as points, d_pts as points_against
FROM TeamSeason1 t
LEFT OUTER JOIN Good_Teams2 g ON t.team = g.team and t.year = g.year
WHERE t.year > 1980 and t.year <= 2009;"""
df = pandas.read_sql(query, conn)
conn.close()

# Defining Explanatory Features
explanatory_features = [col for col in df.columns if col in ['defensive_rebounds', 'field_goal_percentage', 'assists_allowed']]
explanatory_df = df[explanatory_features]
explanatory_colnames = explanatory_df.columns

# Defining Response Series
response_series = df.good_team

# Standardizing features to zero mean and unit variance before modeling
scaler = preprocessing.StandardScaler()
scaler.fit(explanatory_df)
explanatory_df = pandas.DataFrame(scaler.transform(explanatory_df), columns = explanatory_df.columns)

Predicting Wins Using Random Forests


In [20]:
rf = ensemble.RandomForestClassifier(n_estimators= 500)
roc_scores_rf = cross_val_score(rf, explanatory_df, response_series, cv=10, scoring='roc_auc', n_jobs = -1)
print roc_scores_rf.mean()


0.851648059543

Grid Search for Best Parameters


In [30]:
trees_range = range(10, 300, 10)
param_grid = dict(n_estimators = trees_range)
grid = GridSearchCV(rf, param_grid, cv=10, scoring='roc_auc', n_jobs = -1)
grid.fit(explanatory_df, response_series)
grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
plt.figure()
plt.plot(trees_range, grid_mean_scores)


Out[30]:
[<matplotlib.lines.Line2D at 0x10df3f210>]

Best Estimator


In [31]:
best_rf_est = grid.best_estimator_
print best_rf_est.n_estimators
print grid.best_score_


70
0.881351080039

Finding Accuracy of the Feature Subset


In [32]:
# Making two different groups - TRAINING and HOLDOUT
# TRAINING
conn = sqlite3.connect('/Users/MatthewCohen/Documents/SQLite/TeamSeason1.sqlite')
query = """SELECT t.won as wins, g.good_team, t.o_fgm as field_goals_made, t.o_fga as field_goals_attempted,
t.o_ftm as free_throws_made, t.o_fta as free_throws_attempted, t.o_oreb as offensive_rebounds,
t.o_dreb as defensive_rebounds, t.o_reb as total_rebounds, t.o_asts as assists, t.o_pf as personal_fouls,
t.o_stl as steals, t.o_to as turnovers, t.o_3pm as three_pointers_made, t.o_3pa as three_pointers_attempted,
t.d_fgm as field_goals_allowed, t.d_fga as field_goal_attempts_allowed, t.d_reb as rebounds_allowed,
t.d_asts as assists_allowed, t.d_pf as fouls_against, t.d_3pm as three_point_makes_allowed,
(o_fgm * 100.0 / o_fga) as field_goal_percentage, (o_ftm * 100.0 / o_fta) as free_throw_percentage,
(o_3pm * 100.0 / o_3pa) as three_point_percentage, o_blk as blocks, o_pts as points, d_pts as points_against
FROM TeamSeason1 t
LEFT OUTER JOIN Good_Teams2 g ON t.team = g.team and t.year = g.year
WHERE t.year > 1980 and t.year < 1999;"""
df = pandas.read_sql(query, conn)
conn.close()

# Defining Explanatory Features
explanatory_features = [col for col in df.columns if col in ['defensive_rebounds', 'assists_allowed', 'field_goal_percentage']]
explanatory_df = df[explanatory_features]
explanatory_colnames = explanatory_df.columns

# Defining Response Series
response_series = df.good_team

# Standardizing features to zero mean and unit variance before modeling
scaler = preprocessing.StandardScaler()
scaler.fit(explanatory_df)
explanatory_df = pandas.DataFrame(scaler.transform(explanatory_df), columns = explanatory_df.columns)

# Predicting wins using Random Forests 
rf = ensemble.RandomForestClassifier(n_estimators= 500)
roc_scores_rf = cross_val_score(rf, explanatory_df, response_series, cv=10, scoring='roc_auc', n_jobs = -1)

# Grid Search for best parameters
trees_range = range(10, 300, 10)
param_grid = dict(n_estimators = trees_range)
grid = GridSearchCV(rf, param_grid, cv=10, scoring='roc_auc', n_jobs = -1)
grid.fit(explanatory_df, response_series)
grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
plt.figure()
plt.plot(trees_range, grid_mean_scores)


best_rf_est = grid.best_estimator_


# HOLDOUT
conn = sqlite3.connect('/Users/MatthewCohen/Documents/SQLite/TeamSeason1.sqlite')
query2 = """SELECT t.won as wins, g.good_team, t.o_fgm as field_goals_made, t.o_fga as field_goals_attempted,
t.o_ftm as free_throws_made, t.o_fta as free_throws_attempted, t.o_oreb as offensive_rebounds,
t.o_dreb as defensive_rebounds, t.o_reb as total_rebounds, t.o_asts as assists, t.o_pf as personal_fouls,
t.o_stl as steals, t.o_to as turnovers, t.o_3pm as three_pointers_made, t.o_3pa as three_pointers_attempted,
t.d_fgm as field_goals_allowed, t.d_fga as field_goal_attempts_allowed, t.d_reb as rebounds_allowed,
t.d_asts as assists_allowed, t.d_pf as fouls_against, t.d_3pm as three_point_makes_allowed,
(o_fgm * 100.0 / o_fga) as field_goal_percentage, (o_ftm * 100.0 / o_fta) as free_throw_percentage,
(o_3pm * 100.0 / o_3pa) as three_point_percentage, o_blk as blocks, o_pts as points, d_pts as points_against
FROM TeamSeason1 t
LEFT OUTER JOIN Good_Teams2 g ON t.team = g.team and t.year = g.year
WHERE t.year >= 1999 and t.year <= 2009;"""
df2 = pandas.read_sql(query2, conn)
conn.close()

# Defining Explanatory Features
explanatory_features2 = [col for col in df2.columns if col in ['defensive_rebounds', 'assists_allowed', 'field_goal_percentage']]
explanatory_df2 = df2[explanatory_features2]
explanatory_colnames2 = explanatory_df2.columns

# Defining Response Series
response_series2 = df2.good_team

# Transform the holdout with the scaler fit on the TRAINING features above (no refitting)
explanatory_df2 = pandas.DataFrame(scaler.transform(explanatory_df2), columns = explanatory_df2.columns)


prediction = best_rf_est.predict(explanatory_df2)


Visualize Results


In [33]:
import numpy as np

y = response_series2.tolist()
yhat = prediction.tolist()

def randomize(mean):
    """Scatter a small cloud of translucent points around (actual, predicted)
    so overlapping outcomes show up as density."""
    N = 5
    cov = [[0.02, 0], [0, 0.02]]  # independent jitter in each direction
    x, y = np.random.multivariate_normal(mean, cov, N).T
    plt.scatter(x, y, s=70, alpha=0.03)

for i in range(len(y)):
    randomize((y[i], yhat[i]))
plt.xlabel('Actual (good_team)')
plt.ylabel('Predicted')


Accuracy


In [35]:
diff = (response_series2 - prediction).tolist()
# Use float division; in Python 2, int / int truncates toward zero
accuracy = (float(diff.count(0)) / len(diff)) * 100
print accuracy
