In [1]:
import pandas as pd
import numpy as np
import sqlalchemy as sql
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestRegressor

# for custom addition to random forest
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.fixes import parallel_helper
from sklearn.ensemble.base import _partition_estimators
from sklearn.externals.joblib import Parallel, delayed

%matplotlib inline

In [2]:
# load the data

engine = sql.create_engine('sqlite:///data/kaggleData.sqlite')
teams = pd.read_sql_table('Teams', engine)
allSeasonGames = pd.read_sql_table('RegularSeasonDetailedResults', engine)
allTourneyGames = pd.read_sql_table('TourneyDetailedResults', engine)
allTourneySeeds = pd.read_sql_table('TourneySeeds', engine)

allKenPomData = pd.read_csv('data/kenPomTeamData.csv')
allKenPomData = allKenPomData[ [ col for col in allKenPomData if 'Rank' not in col ] ]

A random forest regressor

Simple model: determine price from inputs

  1. Scoring capacity
    1. AdjOE
    2. Position ratings (ORC, ORPF, ORPG, ORSG, ORSF)
  2. Defensive capacity
    1. Opponent AdjDE
    2. Opponent position ratings (DRC, DRPF, DRPG, DRSG, DRSF)
  3. Team characteristics -- these are differences between two teams
    1. Height, tempo, assists, bench
    2. Free throws??
    3. Turnover, blocks, rebounds, etc. (not yet included)
    4. Coach quality, funding (not yet included)
    5. Difference of team ELO (not yet included)

Note: not using all-in efficiency margin but perhaps should in future

Train this to the outcomes of games, accounting for the below. The below factors will transpose the points distributions

  1. Home vs Away
    1. Constant point correction for season
  2. Strength of schedule (not yet included, but somewhat in AdjOE / AdjDE)
    1. Pull in multiple different strength ratings
    2. Adjust player-level as well
  3. Overtime in previous games (not yet included)
  4. Injuries (not yet included)

Then, using RF regression, generate two PDF's for scores and infer win probabilities

Future directions

  1. Extend model as per above
  2. Incorporate home / away in postseason prediction
  3. Train across seasons or including post-seasons, with different weighting

In [3]:
offFeatureNames =  [ 'AdjOE', 'ORC', 'ORPF', 'ORPG', 'ORSG', 'ORSF' ]
defFeatureNames =  [ 'AdjDE', 'DRC', 'ORPF', 'DRPG', 'DRSG', 'DRSF' ]
teamFeatureNames = [ 'Height', 'AdjTempo', 'ARate', 'Bench' ]
featureNames = offFeatureNames + defFeatureNames + teamFeatureNames

In [13]:
def avgHomeAwayPointDiff(seasonGames):
    homeAwayGames = seasonGames[seasonGames['Wloc'] != 'N']
    homePoints = np.where(homeAwayGames['Wloc'] == 'H', homeAwayGames['Wscore'], homeAwayGames['Lscore'])
    awayPoints = np.where(homeAwayGames['Wloc'] == 'A', homeAwayGames['Wscore'], homeAwayGames['Lscore'])
    return homePoints.mean() - awayPoints.mean()

def featureGeneration(kenPomData, teamID, oppTeamID):
    kpd =    kenPomData[kenPomData['Team_Id'] == teamID]
    kpdOpp = kenPomData[kenPomData['Team_Id'] == oppTeamID]
    offFeatures =  [ kpd[f].values[0] for f in offFeatureNames ]
    defFeatures =  [ kpdOpp[f].values[0] for f in defFeatureNames ]
    teamFeatures = [ kpd[f].values[0] - kpdOpp[f].values[0] for f in teamFeatureNames ]
    return offFeatures + defFeatures + teamFeatures

def generateTrainingSet(seasonGames, kenPomData, homeAwayCorrection):
    samples, target = [], []
    for index, row in seasonGames[['Wteam', 'Lteam', 'Wscore', 'Lscore', 'Wloc']].iterrows():
        wteam, lteam, wscore, lscore, wloc = row
        wCorrection = 0 if wloc == 'N' else (homeAwayCorrection/2.0 if wloc == 'A' else -homeAwayCorrection/2.0)
        lCorrection = 0 if wloc == 'N' else (homeAwayCorrection/2.0 if wloc == 'H' else -homeAwayCorrection/2.0)
        wFeatures, wTarget = featureGeneration(kenPomData, wteam, lteam), wscore + wCorrection
        lFeatures, lTarget = featureGeneration(kenPomData, lteam, wteam), lscore + lCorrection
        samples += [ wFeatures, lFeatures ]
        target  += [ wTarget,   lTarget]     
    return samples, target

def trimQuartiles(arr, percentile):
    lowerQuantile = np.percentile(arr, percentile)
    upperQuantile = np.percentile(arr, 100 - percentile)
    return [ s for s in arr if s >= lowerQuantile and s <= upperQuantile ]

def probabilityGreaterThan(arr1, arr2): # returns probability ran # from arr1 is greater than ran # from arr2
    arr1_srt, arr2_srt = sorted(arr1), sorted(arr2)
    probs = []
    idx2 = 0
    for idx1 in range(len(arr1_srt)):
        while idx2 < len(arr2_srt) and arr2_srt[idx2] < arr1_srt[idx1]:
            idx2 += 1
        probs += [ idx2 / len(arr2_srt) ]
    return np.mean(probs)

In [5]:
# copy source from https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/ensemble/forest.py
# but give access to individual tree predictions

class RandomForestRegressorAugmented(RandomForestRegressor):
    
    def subPredictions(self, X):
        check_is_fitted(self, 'estimators_')
        # Check data
        X = self._validate_X_predict(X)

        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        # Parallel loop
        all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                             backend="threading")(
            delayed(parallel_helper)(e, 'predict', X, check_input=False)
            for e in self.estimators_)
        
        return np.array(all_y_hat).T

In [6]:
# filter to target season

season = 2016

seasonGames = allSeasonGames[ allSeasonGames['Season'] == season ]
tourneyGames = allTourneyGames[ allTourneyGames['Season'] == season ]
tourneySeeds = allTourneySeeds[ allTourneySeeds['Season'] == season ]
kenPomData = allKenPomData[ allKenPomData['Season'] == season ]

In [7]:
# points correction factors

homeAwayCorrection = avgHomeAwayPointDiff(seasonGames)

In [8]:
# build up training set
# note this is a bit slow so perhaps use map to speed things up?

samples, target = generateTrainingSet(seasonGames, kenPomData, homeAwayCorrection)

In [10]:
# train our random forest 
# note: removing first two samples to avoid over-fitting when 

rfr_test = RandomForestRegressorAugmented(n_estimators = 500)
rfr_test.fit(samples[2:], target[2:])


Out[10]:
RandomForestRegressorAugmented(bootstrap=True, criterion='mse',
                max_depth=None, max_features='auto', max_leaf_nodes=None,
                min_impurity_split=1e-07, min_samples_leaf=1,
                min_samples_split=2, min_weight_fraction_leaf=0.0,
                n_estimators=500, n_jobs=1, oob_score=False,
                random_state=None, verbose=0, warm_start=False)

In [25]:
# what mattered?

b = sns.barplot(x = np.arange(len(featureNames)), y = rfr_test.feature_importances_)
b.set_xticklabels(featureNames, rotation = 90)
b.set_ylabel('Feature importance')
b.set_xlabel('Feature name')
plt.show()



In [224]:
# how does it do against the first game between alabama and Kennesaw St. (if train set is 2016)

predictedScores = rfr_test.subPredictions([samples[0], samples[1]])
print(np.mean(predictedScores[0]), target[0])
print(np.mean(predictedScores[1]), target[1])
sns.distplot(predictedScores[0], bins = 30)
sns.distplot(predictedScores[1], bins = 30)
plt.show()


77.3810791615 74.2693992491
66.1492622236 66.7306007509

In [221]:
# not bad, but it looks like there are some outliers -- what happens if we take inner 90 percent?
# looks a little better

outlierThresh = 5
predictedScoresRemovedOutliers = trimQuartiles(predictedScores[0], outlierThresh)
print(np.mean(predictedScoresRemovedOutliers), target[0])


76.8948576083 74.2693992491

In [222]:
# what is the probability Alabama wins on a neutral court? Include random tie breaker since RF sometimes gives ties
# note this season Alabama when 18-15, kneesaw went 11-20 -- adj margin diff was 16 pts between two teams
# surely favors Alabama, but seems within reason that ~15% of the time kneesaw would win

psHumbled = np.array([ np.array(trimQuartiles(p, outlierThresh)) for p in predictedScores ])
psTieBreaker = 1e-6*np.random.rand(*psHumbled.shape) + psHumbled
probabilityGreaterThan(psTieBreaker[0], psTieBreaker[1])


Out[222]:
0.81242244261937913

In [244]:
# what if I train to one season and predict in another? Looks pretty good!

otherSeason = 2015

otherSeasonGame = allSeasonGames[ allSeasonGames['Season'] == otherSeason ][:1]
otherKenPomData = allKenPomData[ allKenPomData['Season'] == otherSeason ]
otherSamples =\
    [ featureGeneration(otherKenPomData, otherSeasonGame['Wteam'].values[0], otherSeasonGame['Lteam'].values[0]),
      featureGeneration(otherKenPomData, otherSeasonGame['Lteam'].values[0], otherSeasonGame['Wteam'].values[0]) ]
otherPredictedScores = rfr_test.subPredictions(otherSamples)

print(np.mean(otherPredictedScores[0]), otherSeasonGame['Wscore'].values[0] - homeAwayCorrection/2.0)
print(np.mean(otherPredictedScores[1]), otherSeasonGame['Lscore'].values[0] + homeAwayCorrection/2.0)
sns.distplot(otherPredictedScores[0], bins = 30)
sns.distplot(otherPredictedScores[1], bins = 30)
plt.show()


71.1067278786 71.2693992491
57.0013020174 59.7306007509

In [245]:
psHumbled = np.array([ np.array(trimQuartiles(p, outlierThresh)) for p in otherPredictedScores ])
psTieBreaker = 1e-6*np.random.rand(*psHumbled.shape) + psHumbled
probabilityGreaterThan(psTieBreaker[0], psTieBreaker[1])


Out[245]:
0.87520802741067061

Lessons from the case study

  1. Will have to be extremely careful with overtraining. Likely have to remove all march madness matchups from train set or train to previous seasons.
  2. Have room to find additional important features; everything chosen seems to matter
  3. This method is messy. It might be better to go with native model to classify win vs lose and rethink parametrization

For now, forge ahead with using previous season to train to prevent over-fitting. So, use 2015 regular season to preduct 2016 post-season.


In [41]:
# filter to target season

trainSeason, testSeason = 2014, 2015

trainSeasonGames = allSeasonGames[ allSeasonGames['Season'] == trainSeason ]
trainTourneyGames = allTourneyGames[ allTourneyGames['Season'] == trainSeason ]
trainTourneySeeds = allTourneySeeds[ allTourneySeeds['Season'] == trainSeason ]
trainKenPomData = allKenPomData[ allKenPomData['Season'] == trainSeason ]

testSeasonGames = allSeasonGames[ allSeasonGames['Season'] == testSeason ]
testTourneyGames = allTourneyGames[ allTourneyGames['Season'] == testSeason ]
testTourneySeeds = allTourneySeeds[ allTourneySeeds['Season'] == testSeason ]
testKenPomData = allKenPomData[ allKenPomData['Season'] == testSeason ]

In [42]:
# points correction factors

homeAwayCorrection = avgHomeAwayPointDiff(trainSeasonGames)

In [43]:
# build up training set
# note this is a bit slow so perhaps use map to speed things up?

samples, target = generateTrainingSet(trainSeasonGames, trainKenPomData, homeAwayCorrection)

In [44]:
# train our random forest 
# note: removing first two samples to avoid over-fitting when 

rfr = RandomForestRegressorAugmented(n_estimators = 500)
rfr.fit(samples, target)


Out[44]:
RandomForestRegressorAugmented(bootstrap=True, criterion='mse',
                max_depth=None, max_features='auto', max_leaf_nodes=None,
                min_impurity_split=1e-07, min_samples_leaf=1,
                min_samples_split=2, min_weight_fraction_leaf=0.0,
                n_estimators=500, n_jobs=1, oob_score=False,
                random_state=None, verbose=0, warm_start=False)

In [53]:
# predictions for all matchups in playoffs 

tourneyTeams = list(set(testTourneySeeds['Team']))

predictions, keys = [], []
for team1 in tourneyTeams:
    for team2 in tourneyTeams:
        if team1 < team2:
            team1Sample = featureGeneration(kenPomData, team1, team2)
            team2Sample = featureGeneration(kenPomData, team2, team1)
            scores = rfr.subPredictions([ team1Sample, team2Sample ])
            # probabilities
            psHumbled = np.array([ np.array(trimQuartiles(s, outlierThresh)) for s in scores ])
            psTieBreaker = 1e-8*np.random.rand(*psHumbled.shape) + psHumbled
            #psTieBreaker = 1e-8*np.random.rand(*scores.shape) + scores
            winProbabiity = probabilityGreaterThan(psTieBreaker[0], psTieBreaker[1])
            # capture
            keys += [ '_'.join(map(str, [testSeason, team1, team2])) ]
            predictions += [ winProbabiity ]

predictions = pd.DataFrame({ 'Id' : keys, 'Pred' : predictions}).set_index('Id')

In [57]:
# check the performance

results = testTourneyGames.copy()[['Wteam', 'Lteam']]
results['minTeam'] = np.where(results['Wteam'] < results['Lteam'], results['Wteam'], results['Lteam'])
results['maxTeam'] = np.where(results['Wteam'] > results['Lteam'], results['Wteam'], results['Lteam'])
results['Win'] = np.where(results['Wteam'] == results['minTeam'], 1, 0)
results['Id'] = str(season) + '_' + results['minTeam'].map(str).str.cat(results['maxTeam'].map(str), sep = '_')
results = results[['Id', 'Win']].set_index('Id')

resultsVsReal = results.join(predictions)

log_loss(resultsVsReal['Win'], resultsVsReal['Pred'])


Out[57]:
0.54976137314525475

In [55]:
print(np.array(resultsVsReal))


[[ 1.          0.28204473]
 [ 0.          0.37302928]
 [ 0.          0.42645504]
 [ 0.          0.58826667]
 [ 1.          0.9985891 ]
 [ 1.          0.59950053]
 [ 1.          0.52606542]
 [ 1.          0.39146963]
 [ 0.          0.1825302 ]
 [ 0.          0.79869597]
 [ 0.          0.01129551]
 [ 0.          0.61519981]
 [ 0.          0.22562928]
 [ 0.          0.17355408]
 [ 1.          0.66457927]
 [ 0.          0.72608524]
 [ 0.          0.68818865]
 [ 0.          0.22785966]
 [ 0.          0.00782292]
 [ 0.          0.4611173 ]
 [ 1.          0.4723253 ]
 [ 1.          0.92102451]
 [ 1.          0.91753387]
 [ 0.          0.60787938]
 [ 1.          0.86273194]
 [ 1.          0.81732648]
 [ 1.          0.60576419]
 [ 0.          0.46087585]
 [ 1.          0.83791928]
 [ 0.          0.05034887]
 [ 0.          0.71254984]
 [ 1.          0.46722222]
 [ 0.          0.17451152]
 [ 0.          0.19198037]
 [ 0.          0.25942731]
 [ 0.          0.13936548]
 [ 1.          0.67546063]
 [ 0.          0.15089809]
 [ 1.          0.10955114]
 [ 0.          0.30811569]
 [ 0.          0.35026432]
 [ 0.          0.26742016]
 [ 0.          0.51070519]
 [ 0.          0.47508907]
 [ 1.          0.54576205]
 [ 1.          0.69595283]
 [ 1.          0.56288138]
 [ 1.          0.35622168]
 [ 0.          0.38116985]
 [ 0.          0.46151709]
 [ 0.          0.66183825]
 [ 0.          0.27445228]
 [ 1.          0.8610814 ]
 [ 1.          0.62077863]
 [ 1.          0.6502774 ]
 [ 0.          0.3222126 ]
 [ 1.          0.54487029]
 [ 1.          0.84172226]
 [ 1.          0.81924444]
 [ 1.          0.43551111]
 [ 1.          0.79552894]
 [ 0.          0.61945947]
 [ 1.          0.62490822]
 [ 0.          0.55217979]
 [ 1.          0.80457875]
 [ 0.          0.74732328]
 [ 1.          0.57922222]]

In [40]:
# submit at your own risk
# this blew up on 2016, seems susceptible to upsets

predictions[['Pred']].to_csv('submissions/rfRegressorTest.csv')