In [0]:
# packages
import pandas as pd
import numpy as np
import datetime
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    import sklearn

# required machine learning packages
from sklearn import model_selection
from sklearn.feature_selection import RFE
from sklearn.metrics import brier_score_loss, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV as CCV

from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
import xgboost as xgb

In [0]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# loading CSV files

data_downloaded = drive.CreateFile({'id': '1bC5TaScWuEGhK1LPyti24xbQ1XGXBhNY'})
data_downloaded.GetContentFile('nfl_elo.csv')
data_downloaded = drive.CreateFile({'id': '1LBnfAOgwEdR3MqDTw5jdhLXmCx6cquzr'})
data_downloaded.GetContentFile('nfl_teams.csv')
data_downloaded = drive.CreateFile({'id': '1Qv02AygOdZvOhGWJwOsrsT550suhffoV'})
data_downloaded.GetContentFile('spreadspoke_scores.csv')


df = pd.read_csv("spreadspoke_scores.csv")
teams = pd.read_csv("nfl_teams.csv")
games_elo = pd.read_csv("nfl_elo.csv")

In [0]:
# replacing blank strings with NaN
df = df.replace(r'^\s*$', np.nan, regex=True)

# removing rows from specific columns that have null values, resetting index and changing data types
df = df[(df.score_home.isnull() == False) & (df.team_favorite_id.isnull() == False) & (df.over_under_line.isnull() == False) &
        (df.schedule_season >= 1979)]

df.reset_index(drop=True, inplace=True)
df['over_under_line'] = df.over_under_line.astype(float)

# mapping team_id to the correct teams
df['team_home'] = df.team_home.map(teams.set_index('team_name')['team_id'].to_dict())
df['team_away'] = df.team_away.map(teams.set_index('team_name')['team_id'].to_dict())

# fix team_favorite_id for Colts in 1969 and 1971 SB
df.loc[(df.schedule_season == 1968) & (df.schedule_week == 'Superbowl'), 'team_favorite_id'] = 'IND'
df.loc[(df.schedule_season == 1970) & (df.schedule_week == 'Superbowl'), 'team_favorite_id'] = 'IND'

# creating home favorite and away favorite columns (fill na with 0's)
df.loc[df.team_favorite_id == df.team_home, 'home_favorite'] = 1
df.loc[df.team_favorite_id == df.team_away, 'away_favorite'] = 1
df.home_favorite.fillna(0, inplace=True)
df.away_favorite.fillna(0, inplace=True)

# creating over / under column (fill na with 0's)
df.loc[((df.score_home + df.score_away) > df.over_under_line), 'over'] = 1
df.over.fillna(0, inplace=True)

# stadium neutral and schedule playoff as boolean
df['stadium_neutral'] = df.stadium_neutral.astype(int)
df['schedule_playoff'] = df.schedule_playoff.astype(int)

# change data type of date columns
df['schedule_date'] = pd.to_datetime(df['schedule_date'])
games_elo['date'] = pd.to_datetime(games_elo['date'])

In [0]:
# fixing some schedule_week column errors and converting column to integer data type
df.loc[(df.schedule_week == '18'), 'schedule_week'] = '17'
df.loc[(df.schedule_week == 'Wildcard') | (df.schedule_week == 'WildCard'), 'schedule_week'] = '18'
df.loc[(df.schedule_week == 'Division'), 'schedule_week'] = '19'
df.loc[(df.schedule_week == 'Conference'), 'schedule_week'] = '20'
df.loc[(df.schedule_week == 'Superbowl') | (df.schedule_week == 'SuperBowl'), 'schedule_week'] = '21'
df['schedule_week'] = df.schedule_week.astype(int)

In [0]:
# removing extra columns that aren't necessary for analysis
df = df[['schedule_date', 'schedule_season', 'schedule_week', 'team_home',
       'team_away', 'team_favorite_id', 'spread_favorite',
       'over_under_line', 'weather_temperature',
       'weather_wind_mph', 'score_home', 'score_away',
       'stadium_neutral', 'home_favorite', 'away_favorite',
       'over']]

In [0]:
# Cleaning games_elo and df to merge correctly
wsh_map = {'WSH' : 'WAS'}
games_elo.loc[games_elo.team1 == 'WSH', 'team1'] = 'WAS' 
games_elo.loc[games_elo.team2 == 'WSH', 'team2'] = 'WAS'

# fix dates
df.loc[(df.schedule_date == '2016-09-19') & (df.team_home == 'MIN'), 'schedule_date'] = datetime.datetime(2016, 9, 18)
df.loc[(df.schedule_date == '2017-01-22') & (df.schedule_week == 21), 'schedule_date'] = datetime.datetime(2017, 2, 5)
df.loc[(df.schedule_date == '1990-01-27') & (df.schedule_week == 21), 'schedule_date'] = datetime.datetime(1990, 1, 28)
df.loc[(df.schedule_date == '1990-01-13'), 'schedule_date'] = datetime.datetime(1990, 1, 14)
games_elo.loc[(games_elo.date == '2016-01-09'), 'date'] = datetime.datetime(2016, 1, 10)
games_elo.loc[(games_elo.date == '2016-01-08'), 'date'] = datetime.datetime(2016, 1, 9)
games_elo.loc[(games_elo.date == '2016-01-16'), 'date'] = datetime.datetime(2016, 1, 17)
games_elo.loc[(games_elo.date == '2016-01-15'), 'date'] = datetime.datetime(2016, 1, 16)

In [0]:
# merge games_elo with df
df = df.merge(games_elo, left_on=['schedule_date', 'team_home', 'team_away'], right_on=['date', 'team1', 'team2'], how='left')

# merge to fix neutral games where team_home and team_away are switched
games_elo2 = games_elo.rename(columns={'team1' : 'team2', 'team2' : 'team1', 'elo1' : 'elo2', 'elo2' : 'elo1'})
games_elo2['elo_prob1'] = 1 - games_elo2.elo_prob1
df = df.merge(games_elo2, left_on=['schedule_date', 'team_home', 'team_away'], right_on=['date', 'team1', 'team2'], how='left')

In [0]:
# separating merged columns into x and y cols
x_cols = ['date_x', 'season_x', 'neutral_x', 'playoff_x', 'team1_x', 'team2_x', 'elo1_x', 'elo2_x', 'elo_prob1_x', 'elo_prob2_x', 
          'qbelo_prob1_x', 'qbelo_prob2_x', 'score1_x', 'score2_x']
y_cols = ['date_y', 'season_y', 'neutral_y', 'playoff_y', 'team1_y', 'team2_y', 'elo1_y', 'elo2_y', 'elo_prob1_y', 'elo_prob2_y', 
          'qbelo_prob1_y', 'qbelo_prob2_y','score1_y', 'score2_y']

# filling null values for games_elo merged cols
for x, y in zip(x_cols, y_cols):
    df[x] = df[x].fillna(df[y]) 

# removing y_cols from dataframe    
df = df[['schedule_date', 'schedule_season', 'schedule_week', 'team_home',
       'team_away', 'team_favorite_id', 'spread_favorite', 'over_under_line',
       'weather_temperature', 'weather_wind_mph', 'score_home', 'score_away',
       'stadium_neutral', 'home_favorite', 'away_favorite', 'over', 'neutral_x', 'playoff_x',
         'elo1_x', 'elo2_x', 'elo_prob1_x','elo_prob2_x', 'qbelo_prob1_x', 'qbelo_prob2_x' ]]

# remove _x ending from column names
df.columns = df.columns.str.replace('_x', '')

In [0]:
# creating result column df.loc[(df.score_home > df.score_away), 'result'

# HOME WIN
df['result'] = (df.score_home > df.score_away).astype(int)

# OVER
#df['result'] = (df.over).astype(int)

# HOME WIN AND ATS
#df['result'] = (df.home_favorite == 1) & (df.score_home > df.score_away) & ((df.score_away - df.score_home) > df.spread_favorite).astype(int)

In [0]:
# all column names
df.columns


Out[0]:
Index(['schedule_date', 'schedule_season', 'schedule_week', 'team_home',
       'team_away', 'team_favorite_id', 'spread_favorite', 'over_under_line',
       'weather_temperature', 'weather_wind_mph', 'score_home', 'score_away',
       'stadium_neutral', 'home_favorite', 'away_favorite', 'over', 'neutral',
       'playoff', 'elo1', 'elo2', 'elo_prob1', 'elo_prob2', 'qbelo_prob1',
       'qbelo_prob2', 'result'],
      dtype='object')

In [0]:
# snapshot of data
df.tail(10)


Out[0]:
schedule_date schedule_season schedule_week team_home team_away team_favorite_id spread_favorite over_under_line weather_temperature weather_wind_mph score_home score_away stadium_neutral home_favorite away_favorite over neutral playoff elo1 elo2 elo_prob1 elo_prob2 qbelo_prob1 qbelo_prob2 result
9886 2019-09-22 2019 3 GB DEN GB -7.5 43.0 0.0 0.0 27.0 16.0 0 1.0 0.0 0.0 0.0 NaN 1507.994649 1419.884182 0.707107 0.292893 0.743268 0.256732 1
9887 2019-09-22 2019 3 KC BAL KC -5.5 55.0 0.0 0.0 33.0 28.0 0 1.0 0.0 1.0 0.0 NaN 1639.908760 1604.360622 0.640792 0.359208 0.690695 0.309305 1
9888 2019-09-22 2019 3 LAC HOU LAC -3.0 48.0 0.0 0.0 20.0 27.0 0 1.0 0.0 0.0 0.0 NaN 1583.703049 1513.605730 0.685180 0.314820 0.659790 0.340210 0
9889 2019-09-22 2019 3 MIN OAK MIN -9.0 42.5 0.0 0.0 34.0 14.0 0 1.0 0.0 1.0 0.0 NaN 1541.128678 1408.111377 0.757657 0.242343 0.737240 0.262760 1
9890 2019-09-22 2019 3 NE NYJ NE -20.5 45.5 0.0 0.0 30.0 14.0 0 1.0 0.0 0.0 0.0 NaN 1677.301131 1345.334919 0.907637 0.092363 0.912006 0.087994 1
9891 2019-09-22 2019 3 PHI DET PHI -5.5 47.5 0.0 0.0 24.0 27.0 0 1.0 0.0 1.0 0.0 NaN 1571.763505 1504.718793 0.681378 0.318622 0.698684 0.301316 0
9892 2019-09-22 2019 3 SF PIT SF -6.5 43.5 0.0 0.0 24.0 20.0 0 1.0 0.0 1.0 0.0 NaN 1493.503009 1538.911743 0.528164 0.471836 0.786794 0.213206 1
9893 2019-09-22 2019 3 SEA NO SEA -4.5 44.5 0.0 0.0 27.0 33.0 0 1.0 0.0 1.0 0.0 NaN 1581.220030 1592.947548 0.576070 0.423930 0.730921 0.269079 0
9894 2019-09-22 2019 3 TB NYG TB -6.0 48.0 0.0 0.0 31.0 32.0 0 1.0 0.0 1.0 0.0 NaN 1439.762740 1397.221631 0.650004 0.349996 0.685660 0.314340 0
9895 2019-09-23 2019 3 WAS CHI CHI -4.0 41.5 0.0 0.0 15.0 31.0 0 0.0 1.0 1.0 0.0 NaN 1414.227959 1563.071253 0.381629 0.618371 0.412243 0.587757 0

In [0]:
# null values by column
df.isnull().sum(axis=0)


Out[0]:
schedule_date             0
schedule_season           0
schedule_week             0
team_home                 0
team_away                 0
team_favorite_id          0
spread_favorite           0
over_under_line           0
weather_temperature     245
weather_wind_mph        245
score_home                0
score_away                0
stadium_neutral           0
home_favorite             0
away_favorite             0
over                      0
neutral                   4
playoff                9538
elo1                      4
elo2                      4
elo_prob1                 4
elo_prob2                 4
qbelo_prob1               4
qbelo_prob2               4
result                    0
dtype: int64

In [0]:
# summary statistics
df.describe().transpose()


Out[0]:
count mean std min 25% 50% 75% max
schedule_season 9896.0 1999.583670 11.427849 1979.000000 1990.000000 2000.000000 2009.000000 2019.000000
schedule_week 9896.0 9.251617 5.204222 1.000000 5.000000 9.000000 14.000000 21.000000
spread_favorite 9896.0 -5.368230 3.423312 -26.500000 -7.000000 -4.500000 -3.000000 0.000000
over_under_line 9896.0 41.870069 4.671214 28.000000 38.000000 41.500000 45.000000 63.500000
weather_temperature 9651.0 59.437260 15.979763 -6.000000 49.000000 63.000000 72.000000 97.000000
weather_wind_mph 9651.0 7.325251 5.702686 0.000000 1.000000 8.000000 11.000000 40.000000
score_home 9896.0 22.643290 10.393699 0.000000 16.000000 22.000000 30.000000 62.000000
score_away 9896.0 19.854082 10.041910 0.000000 13.000000 20.000000 27.000000 59.000000
stadium_neutral 9896.0 0.007882 0.088434 0.000000 0.000000 0.000000 0.000000 1.000000
home_favorite 9896.0 0.669867 0.470285 0.000000 0.000000 1.000000 1.000000 1.000000
away_favorite 9896.0 0.315683 0.464811 0.000000 0.000000 0.000000 1.000000 1.000000
over 9896.0 0.483630 0.499757 0.000000 0.000000 0.000000 1.000000 1.000000
neutral 9892.0 0.007279 0.085008 0.000000 0.000000 0.000000 0.000000 1.000000
elo1 9892.0 1507.540308 98.507019 1197.301000 1438.801657 1506.863000 1575.361750 1849.484000
elo2 9892.0 1506.790805 96.755981 1201.561463 1439.192750 1507.567000 1575.960250 1825.961000
elo_prob1 9892.0 0.582324 0.167005 0.070953 0.465624 0.592212 0.708533 0.964578
elo_prob2 9892.0 0.417369 0.166854 0.035422 0.291338 0.407719 0.534117 0.929047
qbelo_prob1 9892.0 0.575710 0.170887 0.059810 0.454738 0.586443 0.706254 0.967197
qbelo_prob2 9892.0 0.424290 0.170887 0.032803 0.293746 0.413557 0.545262 0.940190
result 9896.0 0.580234 0.493545 0.000000 0.000000 1.000000 1.000000 1.000000

In [0]:
# some percentages to take into consideration when betting
home_win = "{:.2f}".format((sum((df.result == 1) & (df.stadium_neutral == 0)) / len(df)) * 100)
away_win = "{:.2f}".format((sum((df.result == 0) & (df.stadium_neutral == 0)) / len(df)) * 100)
under_line = "{:.2f}".format((sum((df.score_home + df.score_away) < df.over_under_line) / len(df)) * 100)
over_line = "{:.2f}".format((sum((df.score_home + df.score_away) > df.over_under_line) / len(df)) * 100)

favored = "{:.2f}".format((sum(((df.home_favorite == 1) & (df.result == 1)) | ((df.away_favorite == 1) & (df.result == 0)))
                           / len(df)) * 100)

cover = "{:.2f}".format((sum(((df.home_favorite == 1) & ((df.score_away - df.score_home) < df.spread_favorite)) | 
                             ((df.away_favorite == 1) & ((df.score_home - df.score_away) < df.spread_favorite))) 
                         / len(df)) * 100)

ats = "{:.2f}".format((sum(((df.home_favorite == 1) & ((df.score_away - df.score_home) > df.spread_favorite)) | 
                           ((df.away_favorite == 1) & ((df.score_home - df.score_away) > df.spread_favorite))) 
                       / len(df)) * 100)

In [0]:
# print all percentages
print("Number of Games: " + str(len(df)))
print("Home Straight Up Win Percentage: " + home_win + "%")
print("Away Straight Up Win Percentage: " + away_win + "%")
print("Under Percentage: " + under_line + "%")
print("Over Percentage: " + over_line + "%")
print("Favored Win Percentage: " + favored + "%")
print("Cover The Spread Percentage: " + cover + "%")
print("Against The Spread Percentage: " + ats + "%")


Number of Games: 9896
Home Straight Up Win Percentage: 57.65%
Away Straight Up Win Percentage: 41.56%
Under Percentage: 49.76%
Over Percentage: 48.36%
Favored Win Percentage: 65.14%
Cover The Spread Percentage: 46.36%
Against The Spread Percentage: 49.42%

In [0]:
# creating 2 separate dataframes with the home teams / scores and the away teams / scores
score = df.groupby(['schedule_season', 'schedule_week', 'team_home']).mean()[['score_home', 'score_away']].reset_index()
aw_score = df.groupby(['schedule_season', 'schedule_week', 'team_away']).mean()[['score_home', 'score_away']].reset_index()

# create total pts column
score['point_diff'] = score.score_home - score.score_away
aw_score['point_diff'] = aw_score.score_away - aw_score.score_home

# append the two dataframes
score = score.append(aw_score, ignore_index=True, sort=True)

# fill null values
score.team_home.fillna(score.team_away, inplace=True)

# sort by season and week 
score.sort_values(['schedule_season', 'schedule_week'], ascending = [True, True], inplace=True)

# removing unneeded columns & changing column name 
score = score[['schedule_season', 'schedule_week', 'team_home', 'point_diff']]
score.rename(columns={'team_home' : 'team'}, inplace=True)

In [0]:
# dictionary of dataframes - separate dataframe for each team
tm_dict = {}
for key in score.team.unique():
    tm_dict[key] = score[score.team == key].reset_index(drop=True)

In [0]:
# dataframe to populate
pts_diff = pd.DataFrame()

# for loop to create a rolling average of the previous games for each season
for yr in score.schedule_season.unique():
    for tm in score.team.unique():
        data = tm_dict[tm].copy()
        data = data[data.schedule_season == yr]
        
        data.loc[:, 'avg_pts_diff'] = data.point_diff.shift().expanding().mean()
        
        pts_diff = pts_diff.append(data)

In [0]:
# merging to df and changing column names
df = df.merge(pts_diff[['schedule_season', 'schedule_week', 'team', 'avg_pts_diff']], 
              left_on=['schedule_season', 'schedule_week', 'team_home'], right_on=['schedule_season', 'schedule_week', 'team'],
              how='left')

df.rename(columns={'avg_pts_diff' : 'hm_avg_pts_diff'}, inplace=True)

df = df.merge(pts_diff[['schedule_season', 'schedule_week', 'team', 'avg_pts_diff']], 
              left_on=['schedule_season', 'schedule_week', 'team_away'], right_on=['schedule_season', 'schedule_week', 'team'],
              how='left')

df.rename(columns={'avg_pts_diff' : 'aw_avg_pts_diff'}, inplace=True)

In [0]:
# average point differential over entire season
total_season = pts_diff.groupby(['schedule_season', 'team']).mean()['point_diff'].reset_index()

In [0]:
# adding schedule week for merge and adding one to the season for predictions
total_season['schedule_week'] = 1
total_season['schedule_season'] += 1

In [0]:
# cleaning of columns
df = df[['schedule_date', 'schedule_season', 'schedule_week', 'team_home',
       'team_away', 'team_favorite_id', 'spread_favorite', 'over_under_line',
       'weather_temperature', 'weather_wind_mph', 'score_home', 'score_away', 'stadium_neutral', 'home_favorite',
       'away_favorite', 'hm_avg_pts_diff','aw_avg_pts_diff', 'elo1', 'elo2', 'elo_prob1', 'elo_prob2', 'qbelo_prob1', 'qbelo_prob2', 'over', 'result']]

In [0]:
# merge to have previous seasons average point differential
df = df.merge(total_season[['schedule_season', 'schedule_week', 'team', 'point_diff']], 
              left_on=['schedule_season', 'schedule_week', 'team_home'], right_on=['schedule_season', 'schedule_week', 'team'],
              how='left')

df.rename(columns={'point_diff' : 'hm_avg_diff'}, inplace=True)

df = df.merge(total_season[['schedule_season', 'schedule_week', 'team', 'point_diff']], 
              left_on=['schedule_season', 'schedule_week', 'team_away'], right_on=['schedule_season', 'schedule_week', 'team'],
              how='left')

df.rename(columns={'point_diff' : 'aw_avg_diff'}, inplace=True)

# fill null values
df.hm_avg_pts_diff.fillna(df.hm_avg_diff, inplace=True)
df.aw_avg_pts_diff.fillna(df.aw_avg_diff, inplace=True)

In [0]:
# cleaning of columns
df = df[['schedule_date', 'schedule_season', 'schedule_week', 'team_home',
       'team_away', 'team_favorite_id', 'spread_favorite', 'over_under_line',
       'weather_temperature', 'weather_wind_mph', 'score_home', 'score_away', 'stadium_neutral', 'home_favorite',
       'away_favorite', 'hm_avg_pts_diff','aw_avg_pts_diff', 'elo1', 'elo2', 'elo_prob1', 'elo_prob2', 'qbelo_prob1', 'qbelo_prob2', 'over', 'result']]

In [0]:
# removing all rows with null values
df = df.dropna(how='any',axis=0)

In [0]:
# initial features possible for model
X = df[['schedule_season', 'schedule_week', 'over_under_line', 'spread_favorite', 'weather_temperature', 'weather_wind_mph',
        'home_favorite', 'hm_avg_pts_diff','aw_avg_pts_diff', 'elo1', 'elo2', 'elo_prob1', 'elo_prob2', 'qbelo_prob1', 'qbelo_prob2']]

y = df['result']

In [0]:
# base model
base = LDA()

# choose 5 best features
rfe = RFE(base, 5)
rfe = rfe.fit(X, y)

# features
print(rfe.support_)
print(rfe.ranking_)


/usr/local/lib/python3.6/dist-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
/usr/local/lib/python3.6/dist-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
/usr/local/lib/python3.6/dist-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
/usr/local/lib/python3.6/dist-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
/usr/local/lib/python3.6/dist-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
/usr/local/lib/python3.6/dist-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
/usr/local/lib/python3.6/dist-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
[False False False False False False  True False False False False  True
  True  True  True]
[ 9  4  8  2 10  7  1  3 11  6  5  1  1  1  1]
/usr/local/lib/python3.6/dist-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
/usr/local/lib/python3.6/dist-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
/usr/local/lib/python3.6/dist-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
/usr/local/lib/python3.6/dist-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")

In [0]:
# best 5 features chosen by the RFE base model
final_x = df[['spread_favorite', 'home_favorite', 'hm_avg_pts_diff', 'elo_prob1', 'elo_prob2', 'qbelo_prob1', 'qbelo_prob2']]
final_x


Out[0]:
spread_favorite home_favorite hm_avg_pts_diff elo_prob1 elo_prob2 qbelo_prob1 qbelo_prob2
14 -2.0 1.0 10.000000 0.656962 0.343038 0.645891 0.354109
15 0.0 0.0 -14.000000 0.566285 0.433715 0.533151 0.466849
16 -3.0 0.0 -2.000000 0.502268 0.497732 0.474937 0.525063
17 -6.0 1.0 3.000000 0.572482 0.427518 0.567900 0.432100
18 0.0 0.0 -15.000000 0.587308 0.412692 0.493688 0.506312
19 -3.0 1.0 -3.000000 0.620610 0.379390 0.611216 0.388784
20 0.0 0.0 14.000000 0.492412 0.507588 0.480420 0.519580
21 -7.0 1.0 2.000000 0.744615 0.255385 0.734354 0.265646
22 -8.0 1.0 -3.000000 0.684211 0.315789 0.642013 0.357987
23 -3.0 0.0 -6.000000 0.515880 0.484120 0.514806 0.485194
24 -6.0 1.0 3.000000 0.733198 0.266802 0.736662 0.263338
25 -3.0 1.0 17.000000 0.605790 0.394210 0.585195 0.414805
26 -13.0 0.0 -6.000000 0.203429 0.796571 0.195525 0.804475
27 -4.0 1.0 6.000000 0.636106 0.363894 0.624739 0.375261
28 -3.0 0.0 5.000000 0.516870 0.483130 0.523518 0.476482
29 -6.0 0.0 -18.500000 0.424853 0.575147 0.397621 0.602379
30 -7.0 1.0 3.000000 0.757372 0.242628 0.767868 0.232132
31 -9.0 1.0 4.500000 0.782305 0.217695 0.817339 0.182661
32 -2.0 1.0 3.000000 0.642566 0.357434 0.641037 0.358963
33 -5.0 1.0 -14.500000 0.768188 0.231812 0.737951 0.262049
34 -13.0 1.0 -1.500000 0.827084 0.172916 0.812240 0.187760
35 -7.0 0.0 -6.500000 0.439397 0.560603 0.432023 0.567977
36 -3.0 1.0 -7.500000 0.463061 0.536939 0.455162 0.544838
37 -6.0 1.0 -28.000000 0.601852 0.398148 0.659437 0.340563
38 -9.0 1.0 18.500000 0.780578 0.219422 0.780983 0.219017
39 -3.0 1.0 -13.000000 0.453306 0.546694 0.430703 0.569297
40 -6.0 0.0 6.000000 0.316497 0.683503 0.305767 0.694233
41 -6.0 1.0 0.500000 0.695947 0.304053 0.703896 0.296104
42 -3.0 1.0 5.666667 0.549427 0.450573 0.551861 0.448139
43 -3.0 0.0 -14.333333 0.390170 0.609830 0.316901 0.683099
... ... ... ... ... ... ... ...
9990 -12.5 0.0 49.000000 0.832054 0.167946 0.805902 0.194098
9991 0.0 1.0 -1.000000 0.546993 0.453007 0.466439 0.533561
9992 -3.0 0.0 -8.000000 0.413819 0.586181 0.459613 0.540387
9993 -1.0 0.0 0.000000 0.434138 0.565862 0.409146 0.590854
9994 -3.0 1.0 7.000000 0.493512 0.506488 0.557226 0.442774
9995 -3.0 1.0 30.000000 0.601295 0.398705 0.676157 0.323843
9996 -7.5 1.0 -2.000000 0.690490 0.309510 0.804020 0.195980
9997 -7.0 0.0 8.000000 0.319582 0.680418 0.306070 0.693930
9998 -2.0 1.0 3.000000 0.584798 0.415202 0.556322 0.443678
9999 -18.0 0.0 -49.000000 0.230725 0.769275 0.186361 0.813639
10000 -1.5 0.0 -18.000000 0.553512 0.446488 0.482772 0.517228
10001 -4.0 1.0 -30.000000 0.569004 0.430996 0.584538 0.415462
10002 -6.0 0.0 -5.000000 0.409885 0.590115 0.377841 0.622159
10003 -6.5 0.0 -1.000000 0.531159 0.468841 0.500309 0.499691
10004 -2.0 0.0 -7.500000 0.437474 0.562526 0.331980 0.668020
10005 -2.5 1.0 -3.000000 0.455128 0.544872 0.611187 0.388813
10006 -1.5 1.0 -2.000000 0.651773 0.348227 0.538457 0.461543
10007 -6.0 1.0 7.500000 0.716570 0.283430 0.737074 0.262926
10008 -4.0 0.0 -5.000000 0.338904 0.661096 0.390260 0.609740
10009 -22.0 1.0 14.000000 0.829257 0.170743 0.910799 0.089201
10010 -7.5 1.0 6.000000 0.707107 0.292893 0.743268 0.256732
10011 -5.5 1.0 16.000000 0.640792 0.359208 0.690695 0.309305
10012 -3.0 1.0 1.500000 0.685180 0.314820 0.659790 0.340210
10013 -9.0 1.0 5.500000 0.757657 0.242343 0.737240 0.262760
10014 -20.5 1.0 36.500000 0.907637 0.092363 0.912006 0.087994
10015 -5.5 1.0 0.500000 0.681378 0.318622 0.698684 0.301316
10016 -6.5 1.0 19.000000 0.528164 0.471836 0.786794 0.213206
10017 -4.5 1.0 1.500000 0.576070 0.423930 0.730921 0.269079
10018 -6.0 1.0 -4.000000 0.650004 0.349996 0.685660 0.314340
10019 -4.0 0.0 -7.500000 0.381629 0.618371 0.412243 0.587757

9745 rows × 7 columns


In [0]:
# prepare models
models = []

models.append(('LRG', LogisticRegression(solver='liblinear')))
models.append(('KNB', KNeighborsClassifier()))
models.append(('GNB', GaussianNB()))
models.append(('XGB', xgb.XGBClassifier(random_state=0)))
models.append(('RFC', RandomForestClassifier(random_state=0, n_estimators=100)))
models.append(('DTC', DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=5)))

# evaluate each model by average and standard deviations of roc auc 
results = []
names = []

for name, m in models:
    kfold = model_selection.KFold(n_splits=5, random_state=0)
    cv_results = model_selection.cross_val_score(m, final_x, y, cv=kfold, scoring = 'roc_auc')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


LRG: 0.690277 (0.015071)
KNB: 0.628918 (0.005377)
GNB: 0.689834 (0.013784)
XGB: 0.692945 (0.013838)
RFC: 0.654952 (0.011307)
DTC: 0.687032 (0.010799)

In [0]:
# training and testing data (2017 and 2018)
train = df.copy()
test = df.copy()
train = train.loc[train['schedule_season'] < 2019]
test = test.loc[test['schedule_season'] > 2010]
X_train = train[['over_under_line', 'spread_favorite', 'home_favorite', 'hm_avg_pts_diff', 'elo_prob1', 'qbelo_prob1', 'qbelo_prob2']]
y_train = train['result']
X_test = test[['over_under_line', 'spread_favorite', 'home_favorite', 'hm_avg_pts_diff', 'elo_prob1', 'qbelo_prob1', 'qbelo_prob2',]]
y_test = test['result']

train.head()


Out[0]:
schedule_date schedule_season schedule_week team_home team_away team_favorite_id spread_favorite over_under_line weather_temperature weather_wind_mph score_home score_away stadium_neutral home_favorite away_favorite hm_avg_pts_diff aw_avg_pts_diff elo1 elo2 elo_prob1 elo_prob2 qbelo_prob1 qbelo_prob2 over result
14 1979-09-06 1979 2 DEN LAR DEN -2.0 31.5 67.0 8.0 9.0 13.0 0 1.0 0.0 10.0 -7.0 1592.275 1544.396 0.656962 0.343038 0.645891 0.354109 0.0 0
15 1979-09-09 1979 2 IND TB PICK 0.0 33.0 64.0 8.0 26.0 29.0 0 0.0 0.0 -14.0 15.0 1397.713 1416.381 0.566285 0.433715 0.533151 0.466849 1.0 0
16 1979-09-09 1979 2 BUF CIN CIN -3.0 34.0 54.0 7.0 51.0 24.0 0 0.0 1.0 -2.0 -10.0 1414.802 1478.226 0.502268 0.497732 0.474937 0.525063 1.0 1
17 1979-09-09 1979 2 CHI MIN CHI -6.0 31.5 60.0 11.0 26.0 7.0 0 1.0 0.0 3.0 6.0 1495.817 1510.094 0.572482 0.427518 0.567900 0.432100 1.0 1
18 1979-09-09 1979 2 DET WAS PICK 0.0 35.5 72.0 0.0 24.0 27.0 0 0.0 0.0 -15.0 -2.0 1455.863 1459.567 0.587308 0.412692 0.493688 0.506312 1.0 0

In [0]:
# calibrate probabilities and fit model to training data
boost = xgb.XGBClassifier()
dtc = DecisionTreeClassifier(max_depth=5, criterion='entropy')
lrg = LogisticRegression(solver='liblinear')
vote = VotingClassifier(estimators=[('boost', boost), ('dtc', dtc), ('lrg', lrg)], voting='soft')

model = CCV(vote, method='isotonic', cv=3)
model.fit(X_train, y_train)


Out[0]:
CalibratedClassifierCV(base_estimator=VotingClassifier(estimators=[('boost',
                                                                    XGBClassifier(base_score=0.5,
                                                                                  booster='gbtree',
                                                                                  colsample_bylevel=1,
                                                                                  colsample_bynode=1,
                                                                                  colsample_bytree=1,
                                                                                  gamma=0,
                                                                                  learning_rate=0.1,
                                                                                  max_delta_step=0,
                                                                                  max_depth=3,
                                                                                  min_child_weight=1,
                                                                                  missing=None,
                                                                                  n_estimators=100,
                                                                                  n_jobs=1,
                                                                                  nthread=None,
                                                                                  objective='binary:logistic',
                                                                                  random_state=0,
                                                                                  r...
                                                                                           splitter='best')),
                                                                   ('lrg',
                                                                    LogisticRegression(C=1.0,
                                                                                       class_weight=None,
                                                                                       dual=False,
                                                                                       fit_intercept=True,
                                                                                       intercept_scaling=1,
                                                                                       l1_ratio=None,
                                                                                       max_iter=100,
                                                                                       multi_class='warn',
                                                                                       n_jobs=None,
                                                                                       penalty='l2',
                                                                                       random_state=None,
                                                                                       solver='liblinear',
                                                                                       tol=0.0001,
                                                                                       verbose=0,
                                                                                       warm_start=False))],
                                                       flatten_transform=True,
                                                       n_jobs=None,
                                                       voting='soft',
                                                       weights=None),
                       cv=3, method='isotonic')

In [0]:
# predict probabilities
predicted = model.predict_proba(X_test)[:,1]

In [0]:
# ROC AUC Score higher is better while Brier Score the lower the better
print("Metrics" + "\t\t" + "My Model" + "\t" + "Elo Results")
print("ROC AUC Score: " +  "\t" + "{:.4f}".format(roc_auc_score(y_test, predicted)) + "\t\t" + "{:.4f}".format(roc_auc_score(test.result, test.elo_prob1)))
print("Brier Score: " + "\t" + "{:.4f}".format(brier_score_loss(y_test, predicted)) + "\t\t" + "{:.4f}".format(brier_score_loss(test.result, test.elo_prob1)))


Metrics		My Model	Elo Results
ROC AUC Score: 	0.7363		0.6924
Brier Score: 	0.2039		0.2175

In [0]:
# creating a column with the models probabilities to analyze vs elo fivethirtyeight
test.loc[:,'hm_prob'] = predicted
test = test[['schedule_season', 'schedule_week', 'team_home', 'team_away', 'elo_prob1', 'hm_prob', 'result']]

In [0]:
# calulate bets won (only make a bet when probability is greater than / equal to 60% or less than / equal to 40%)
test['my_bet_won'] = (((test.hm_prob >= 0.60) & (test.result == 1)) | ((test.hm_prob <= 0.40) & (test.result == 0))).astype(int)
test['elo_bet_won'] = (((test.elo_prob1 >= 0.60) & (test.result == 1)) | ((test.elo_prob1 <= 0.40) & (test.result == 0))).astype(int)

# calulate bets lost (only make a bet when probability is greater than / equal to 60% or less than / equal to 40%)
test['my_bet_lost'] = (((test.hm_prob >= 0.60) & (test.result == 0)) | ((test.hm_prob <= 0.40) & (test.result == 1))).astype(int)
test['elo_bet_lost'] = (((test.elo_prob1 >= 0.60) & (test.result == 0)) | ((test.elo_prob1 <= 0.40) & (test.result == 1))).astype(int)

In [0]:
# printing some quick overall results for my model
print("My Model Win Percentage: " + "{:.4f}".format(test.my_bet_won.sum() / (test.my_bet_lost.sum() + test.my_bet_won.sum())))
print("Total Number of Bets Won: " + str(test.my_bet_won.sum()))
print("Total Number of Bets Made: " + str((test.my_bet_lost.sum() + test.my_bet_won.sum())))
print("Possible Games: " + str(len(test)))


My Model Win Percentage: 0.7449
Total Number of Bets Won: 1057
Total Number of Bets Made: 1419
Possible Games: 2151

In [0]:
# printing some quick overall results for fivethirtyeight's ELO model
print("ELO Model Win Percentage: " + "{:.4f}".format(test.elo_bet_won.sum()/(test.elo_bet_lost.sum() + test.elo_bet_won.sum())))
print("Total Number of Bets Won: " + str(test.elo_bet_won.sum()))
print("Total Number of Bets Made: " + str((test.elo_bet_lost.sum() + test.elo_bet_won.sum())))
print("Possible Games: " + str(len(test)))


ELO Model Win Percentage: 0.7015
Total Number of Bets Won: 973
Total Number of Bets Made: 1387
Possible Games: 2151

In [0]:
# creating week by week results
results_df = test.groupby(['schedule_season', 'schedule_week']).agg({'team_home' : 'count', 'my_bet_won' : 'sum', 
'elo_bet_won' : 'sum', 'my_bet_lost' : 'sum', 'elo_bet_lost' : 'sum'}).reset_index().rename(columns=
                                                                                            {'team_home' : 'total_games'})

# counting total bets for my model and the ELO model (prob >= 60% or prob <= 40%)
results_df['total_bets'] = results_df.my_bet_won + results_df.my_bet_lost
results_df['elo_total_bets'] = results_df.elo_bet_won + results_df.elo_bet_lost

# creating accuracy columns based on bets made not on total games
results_df['bet_accuracy'] = round((results_df.my_bet_won / results_df.total_bets) * 100, 2)
results_df['elo_bet_accuracy'] = round((results_df.elo_bet_won / results_df.elo_total_bets) * 100, 2)
results_df = results_df[['schedule_season', 'schedule_week', 'bet_accuracy', 'elo_bet_accuracy',
                         'total_bets', 'elo_total_bets', 'total_games']]

In [0]:
results_df.tail()


Out[0]:
schedule_season schedule_week bet_accuracy elo_bet_accuracy total_bets elo_total_bets total_games
160 2018 20 0.00 0.00 2 2 2
161 2018 21 NaN NaN 0 0 1
162 2019 1 83.33 81.82 12 11 16
163 2019 2 83.33 66.67 6 6 16
164 2019 3 69.23 75.00 13 12 16

In [0]:
model.save("nflmodel1.h5")


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-42-84f49ef6abd9> in <module>()
----> 1 model.save("nflmodel1.h5")

AttributeError: 'CalibratedClassifierCV' object has no attribute 'save'