In [1]:
import pandas as pd
import numpy as np
import sqlalchemy as sql
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestRegressor
# for custom addition to random forest
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.fixes import parallel_helper
from sklearn.ensemble.base import _partition_estimators
from sklearn.externals.joblib import Parallel, delayed
%matplotlib inline
In [2]:
# Read the Kaggle tables from the SQLite file and the KenPom team data from CSV
engine = sql.create_engine('sqlite:///data/kaggleData.sqlite')
teams, allSeasonGames, allTourneyGames, allTourneySeeds = (
    pd.read_sql_table(tableName, engine)
    for tableName in ('Teams', 'RegularSeasonDetailedResults', 'TourneyDetailedResults', 'TourneySeeds')
)
allKenPomData = pd.read_csv('data/kenPomTeamData.csv')
# keep only the columns whose name does not contain 'Rank'
allKenPomData = allKenPomData.loc[:, [ 'Rank' not in col for col in allKenPomData.columns ]]
Simple model: predict points scored from the inputs.
Note: not using the all-in efficiency margin, but perhaps should in the future.
Train this on the outcomes of games, accounting for the factors below, which shift the points distributions.
Then, using RF regression, generate two PDFs for the scores and infer win probabilities.
In [3]:
# Columns read from the KenPom row of the team whose score is being predicted
offFeatureNames = [ 'AdjOE', 'ORC', 'ORPF', 'ORPG', 'ORSG', 'ORSF' ]
# Columns read from the opponent's KenPom row.
# 'DRPF' replaces 'ORPF', which broke the DR* pattern of this list and duplicated an
# offensive column -- assumes the KenPom CSV provides a 'DRPF' column; TODO confirm
defFeatureNames = [ 'AdjDE', 'DRC', 'DRPF', 'DRPG', 'DRSG', 'DRSF' ]
# Columns used as (team value - opponent value) differences
teamFeatureNames = [ 'Height', 'AdjTempo', 'ARate', 'Bench' ]
# Order matches the feature vector built by featureGeneration
featureNames = offFeatureNames + defFeatureNames + teamFeatureNames
In [13]:
def avgHomeAwayPointDiff(seasonGames):
    """Return mean home-side points minus mean away-side points.

    Games whose 'Wloc' is 'N' are ignored. For the remaining games the
    winner's score counts as the home score when 'Wloc' is 'H' and as the
    away score when 'Wloc' is 'A'; the loser's score fills the other side.
    """
    nonNeutral = seasonGames[seasonGames['Wloc'] != 'N']
    winnerAtHome = nonNeutral['Wloc'] == 'H'
    winnerAway = nonNeutral['Wloc'] == 'A'
    homeScores = np.where(winnerAtHome, nonNeutral['Wscore'], nonNeutral['Lscore'])
    awayScores = np.where(winnerAway, nonNeutral['Wscore'], nonNeutral['Lscore'])
    return homeScores.mean() - awayScores.mean()
def featureGeneration(kenPomData, teamID, oppTeamID):
    """Build the feature vector for teamID playing against oppTeamID.

    The vector is, in order: the offFeatureNames columns of the team's row,
    the defFeatureNames columns of the opponent's row, and the team-minus-
    opponent difference for each teamFeatureNames column. The first row
    matching each 'Team_Id' is used; an IndexError is raised if none matches.
    """
    teamRows = kenPomData[kenPomData['Team_Id'] == teamID]
    oppRows = kenPomData[kenPomData['Team_Id'] == oppTeamID]
    features = []
    for name in offFeatureNames:
        features.append(teamRows[name].values[0])
    for name in defFeatureNames:
        features.append(oppRows[name].values[0])
    for name in teamFeatureNames:
        features.append(teamRows[name].values[0] - oppRows[name].values[0])
    return features
def generateTrainingSet(seasonGames, kenPomData, homeAwayCorrection):
    """Turn each game into two (features, corrected score) training pairs.

    Every game contributes the winner's sample followed by the loser's.
    Scores are moved toward a neutral court: for non-neutral games the side
    that was away gains half of homeAwayCorrection and the other side loses
    half; games with 'Wloc' == 'N' are left unchanged.

    Returns (samples, target) as two parallel lists.
    """
    halfCorrection = homeAwayCorrection/2.0
    gameColumns = ['Wteam', 'Lteam', 'Wscore', 'Lscore', 'Wloc']
    samples, target = [], []
    for _, game in seasonGames[gameColumns].iterrows():
        wteam, lteam, wscore, lscore, wloc = game
        if wloc == 'N':
            wCorrection = lCorrection = 0
        else:
            wCorrection = halfCorrection if wloc == 'A' else -halfCorrection
            lCorrection = halfCorrection if wloc == 'H' else -halfCorrection
        wFeatures = featureGeneration(kenPomData, wteam, lteam)
        lFeatures = featureGeneration(kenPomData, lteam, wteam)
        samples.extend([ wFeatures, lFeatures ])
        target.extend([ wscore + wCorrection, lscore + lCorrection ])
    return samples, target
def trimQuartiles(arr, percentile):
    """Return, as a list in original order, the values of arr lying between
    its percentile-th and (100 - percentile)-th percentiles, bounds included.
    """
    floor = np.percentile(arr, percentile)
    ceiling = np.percentile(arr, 100 - percentile)
    kept = []
    for value in arr:
        # values equal to either bound are retained
        if value >= floor and value <= ceiling:
            kept.append(value)
    return kept
def probabilityGreaterThan(arr1, arr2):
    """Return the probability that a random draw from arr1 is strictly greater
    than a random draw from arr2.

    Both inputs are sorted, then a single forward pass counts, for each value
    of arr1, how many values of arr2 lie strictly below it. Ties count as
    "not greater".
    """
    ordered1 = sorted(arr1)
    ordered2 = sorted(arr2)
    size2 = len(ordered2)
    fractions = []
    below = 0  # number of arr2 values strictly below the current arr1 value
    for value in ordered1:
        # the pointer only moves forward because ordered1 is ascending
        while below < size2 and ordered2[below] < value:
            below += 1
        fractions.append(below / size2)
    return np.mean(fractions)
In [5]:
# Adapted from https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/ensemble/forest.py
# so that the predictions of the individual trees are accessible
class RandomForestRegressorAugmented(RandomForestRegressor):
    """RandomForestRegressor with access to each tree's own prediction."""

    def subPredictions(self, X):
        """Return the transposed stack of every fitted tree's predict(X).

        Requires the forest to be fitted ('estimators_' must exist).
        """
        check_is_fitted(self, 'estimators_')

        # validate / convert the input once, so each tree can skip its own check
        X = self._validate_X_predict(X)

        # number of parallel jobs to spread the trees over
        workerCount, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        runner = Parallel(n_jobs=workerCount, verbose=self.verbose,
                          backend="threading")
        perTree = runner(
            delayed(parallel_helper)(tree, 'predict', X, check_input=False)
            for tree in self.estimators_)

        # transpose so the first axis follows the samples rather than the trees
        return np.array(perTree).T
In [6]:
# Restrict every table to the season being explored
season = 2016
seasonGames = allSeasonGames.loc[ allSeasonGames['Season'] == season ]
tourneyGames = allTourneyGames.loc[ allTourneyGames['Season'] == season ]
tourneySeeds = allTourneySeeds.loc[ allTourneySeeds['Season'] == season ]
kenPomData = allKenPomData.loc[ allKenPomData['Season'] == season ]
In [7]:
# Average home-minus-away points gap, used to move scores to a neutral court
homeAwayCorrection = avgHomeAwayPointDiff(seasonGames = seasonGames)
In [8]:
# Assemble the training samples and their location-corrected scores
# (row-by-row iteration makes this slow; a vectorised or map-based build may help)
samples, target = generateTrainingSet(seasonGames = seasonGames,
                                      kenPomData = kenPomData,
                                      homeAwayCorrection = homeAwayCorrection)
In [10]:
# Fit the random forest.
# The first two samples (both sides of the first game) are left out of the fit
# so that the prediction check on that game is not made on data the forest has seen.
rfr_test = RandomForestRegressorAugmented(n_estimators = 500)
rfr_test.fit(X = samples[2:], y = target[2:])
Out[10]:
In [25]:
# Which inputs does the forest rely on most?
b = sns.barplot(x = np.arange(len(featureNames)), y = rfr_test.feature_importances_)
b.set_xticklabels(featureNames, rotation = 90)
b.set(xlabel = 'Feature name', ylabel = 'Feature importance')
plt.show()
In [224]:
# Check the forest on the first game, Alabama vs. Kennesaw St. (when the training set is 2016):
# print mean per-tree prediction next to the corrected actual score, then plot both distributions
predictedScores = rfr_test.subPredictions([samples[0], samples[1]])
for gameSide in (0, 1):
    print(np.mean(predictedScores[gameSide]), target[gameSide])
for gameSide in (0, 1):
    sns.distplot(predictedScores[gameSide], bins = 30)
plt.show()
In [221]:
# Not bad, but there appear to be some outliers -- keeping only the inner 90 percent
# of the per-tree predictions looks a little better
outlierThresh = 5
predictedScoresRemovedOutliers = trimQuartiles(arr = predictedScores[0], percentile = outlierThresh)
print(np.mean(predictedScoresRemovedOutliers), target[0])
In [222]:
# What is the probability Alabama wins on a neutral court? A random tie-breaker is added
# because the forest sometimes gives identical predictions.
# Note: this season Alabama went 18-15 and Kennesaw St. went 11-20 -- the adjusted margin
# difference between the two teams was 16 pts. That surely favors Alabama, but it seems
# within reason that Kennesaw St. would win ~15% of the time.
#
# The two trimmed arrays are kept as separate 1-D arrays: trimQuartiles keeps values equal
# to its bounds, so the arrays can differ in length and cannot be stacked into one 2-D array.
# The jitter is therefore drawn per array, one value per element.
psHumbled = [ np.asarray(trimQuartiles(p, outlierThresh)) for p in predictedScores ]
psTieBreaker = [ p + 1e-6*np.random.rand(len(p)) for p in psHumbled ]
probabilityGreaterThan(psTieBreaker[0], psTieBreaker[1])
Out[222]:
In [244]:
# What if the forest is trained on one season and predicts a game from another?
otherSeason = 2015
otherSeasonGame = allSeasonGames[ allSeasonGames['Season'] == otherSeason ][:1]
otherKenPomData = allKenPomData[ allKenPomData['Season'] == otherSeason ]
otherWteam = otherSeasonGame['Wteam'].values[0]
otherLteam = otherSeasonGame['Lteam'].values[0]
otherWloc = otherSeasonGame['Wloc'].values[0]
otherSamples = [ featureGeneration(otherKenPomData, otherWteam, otherLteam),
                 featureGeneration(otherKenPomData, otherLteam, otherWteam) ]
otherPredictedScores = rfr_test.subPredictions(otherSamples)
# Apply the same location-aware correction as generateTrainingSet instead of assuming
# the winner played at home (for a home win this is -half for the winner, +half for the loser)
if otherWloc == 'N':
    otherWCorrection = otherLCorrection = 0
else:
    otherWCorrection = homeAwayCorrection/2.0 if otherWloc == 'A' else -homeAwayCorrection/2.0
    otherLCorrection = homeAwayCorrection/2.0 if otherWloc == 'H' else -homeAwayCorrection/2.0
print(np.mean(otherPredictedScores[0]), otherSeasonGame['Wscore'].values[0] + otherWCorrection)
print(np.mean(otherPredictedScores[1]), otherSeasonGame['Lscore'].values[0] + otherLCorrection)
sns.distplot(otherPredictedScores[0], bins = 30)
sns.distplot(otherPredictedScores[1], bins = 30)
plt.show()
In [245]:
# Win probability for the other-season game.
# The trimmed arrays stay separate because trimQuartiles keeps values equal to its bounds,
# so their lengths can differ; the tie-breaking jitter is drawn per element of each array.
psHumbled = [ np.asarray(trimQuartiles(p, outlierThresh)) for p in otherPredictedScores ]
psTieBreaker = [ p + 1e-6*np.random.rand(len(p)) for p in psHumbled ]
probabilityGreaterThan(psTieBreaker[0], psTieBreaker[1])
Out[245]:
For now, forge ahead with training on the previous season to prevent over-fitting: e.g. use the 2015 regular season to predict the 2016 post-season. (Note: the cell below is currently set to train on 2014 and test on 2015.)
In [41]:
# Split every table into the training season and the test season
trainSeason, testSeason = 2014, 2015

trainSeasonGames = allSeasonGames.loc[ allSeasonGames['Season'] == trainSeason ]
trainTourneyGames = allTourneyGames.loc[ allTourneyGames['Season'] == trainSeason ]
trainTourneySeeds = allTourneySeeds.loc[ allTourneySeeds['Season'] == trainSeason ]
trainKenPomData = allKenPomData.loc[ allKenPomData['Season'] == trainSeason ]

testSeasonGames = allSeasonGames.loc[ allSeasonGames['Season'] == testSeason ]
testTourneyGames = allTourneyGames.loc[ allTourneyGames['Season'] == testSeason ]
testTourneySeeds = allTourneySeeds.loc[ allTourneySeeds['Season'] == testSeason ]
testKenPomData = allKenPomData.loc[ allKenPomData['Season'] == testSeason ]
In [42]:
# Home-minus-away points gap measured on the training season
homeAwayCorrection = avgHomeAwayPointDiff(seasonGames = trainSeasonGames)
In [43]:
# Assemble the training samples and location-corrected scores for the training season
# (row-by-row iteration makes this slow; a vectorised or map-based build may help)
samples, target = generateTrainingSet(seasonGames = trainSeasonGames,
                                      kenPomData = trainKenPomData,
                                      homeAwayCorrection = homeAwayCorrection)
In [44]:
# Fit the random forest on every training-season sample
# (no samples are held out here: testing is done on another season)
rfr = RandomForestRegressorAugmented(n_estimators = 500)
rfr.fit(X = samples, y = target)
Out[44]:
In [53]:
# Predict every possible tournament matchup of the test season.
# Teams are sorted so the rows come out in a reproducible order.
tourneyTeams = sorted(set(testTourneySeeds['Team']))
winProbabilities, keys = [], []
for team1 in tourneyTeams:
    for team2 in tourneyTeams:
        # each pair is evaluated once, with the lower team id first
        if team1 < team2:
            # features must come from the test season's KenPom data
            # (previously the 2016 exploration frame `kenPomData` was used here)
            team1Sample = featureGeneration(testKenPomData, team1, team2)
            team2Sample = featureGeneration(testKenPomData, team2, team1)
            scores = rfr.subPredictions([ team1Sample, team2Sample ])
            # probabilities: trim each team's per-tree scores, then jitter element-wise to
            # break ties. The trimmed arrays stay separate because trimQuartiles keeps values
            # equal to its bounds, so their lengths can differ.
            psHumbled = [ np.asarray(trimQuartiles(s, outlierThresh)) for s in scores ]
            psTieBreaker = [ p + 1e-8*np.random.rand(len(p)) for p in psHumbled ]
            winProbability = probabilityGreaterThan(psTieBreaker[0], psTieBreaker[1])
            # capture
            keys.append('_'.join(map(str, [testSeason, team1, team2])))
            winProbabilities.append(winProbability)
predictions = pd.DataFrame({ 'Id' : keys, 'Pred' : winProbabilities }).set_index('Id')
In [57]:
# Check the performance: log loss of the predictions on the test season's tournament games
results = testTourneyGames[['Wteam', 'Lteam']].copy()
results['minTeam'] = np.minimum(results['Wteam'], results['Lteam'])
results['maxTeam'] = np.maximum(results['Wteam'], results['Lteam'])
# Win is 1 when the lower-id team (the first team in the Id) won
results['Win'] = np.where(results['Wteam'] == results['minTeam'], 1, 0)
# Ids must use testSeason to match the prediction keys (previously `season`, the
# exploration season, was used, so the join below could not match)
results['Id'] = str(testSeason) + '_' + results['minTeam'].map(str).str.cat(results['maxTeam'].map(str), sep = '_')
results = results[['Id', 'Win']].set_index('Id')
resultsVsReal = results.join(predictions)
assert resultsVsReal['Pred'].notnull().all(), 'some tournament games have no matching prediction'
log_loss(resultsVsReal['Win'], resultsVsReal['Pred'])
Out[57]:
In [55]:
# Show the actual outcome next to the predicted probability for every game
print(resultsVsReal.values)
In [40]:
# Write the submission file -- use at your own risk:
# this approach blew up on 2016 and seems susceptible to upsets
predictions.loc[:, ['Pred']].to_csv('submissions/rfRegressorTest.csv')