In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from geopy.distance import great_circle as geodist
import sklearn.linear_model as sklm
import sklearn.preprocessing as skpp
import sklearn.metrics as skm
import sklearn.model_selection as skms
In [2]:
# Load every input table used by the notebook. All paths are relative to the
# notebook's working directory; the files under data/addl/ are presumably
# supplementary datasets -- TODO confirm provenance.
# Columns read downstream: Season, Daynum, Wteam, Lteam, Wscore, Lscore, Wloc, Numot and box-score stats.
reg_season = pd.read_csv('data/RegularSeasonDetailedResults.csv')
# Columns read downstream: Season, Daynum, Wteam, Lteam.
tourney = pd.read_csv('data/TourneyDetailedResults.csv')
# Columns read downstream: season, rating_day_num, team, orank.
ratings = pd.read_csv('data/addl/massey_ordinals_2003-2016.csv')
# Columns read downstream: team_id, lat, lng.
team_geog = pd.read_csv('data/addl/TeamGeog.csv')
# Columns read downstream: season, slot, lat, lng.
tourney_geog = pd.read_csv('data/addl/TourneyGeog_Thru2016.csv')
# Columns read downstream: Season, Slot, Strongseed, Weakseed.
tourney_slots = pd.read_csv('data/TourneySlots.csv')
# Columns read downstream: Season, Team, Seed.
tourney_seeds = pd.read_csv('data/TourneySeeds.csv')
# Columns read downstream: Season, Team_Id, AdjEM, AdjTempo.
kenpom = pd.read_csv('data/kenPomTeamData.csv')
# Columns read downstream: Team_Id.
teams = pd.read_csv('data/Teams.csv')
Use published rankings, together with the distance each team travels to the game, to classify winners and losers.
Train on the regular season and test on the postseason.
Considerations:
In [3]:
def attach_ratings_diff_stats(df, ratings_eos, season):
    """Attach end-of-season rating statistics for both teams of every row.

    Parameters
    ----------
    df : DataFrame with 'Season', 'Team1' and 'Team2' columns.
    ratings_eos : DataFrame with 'season', 'team', 'mean_rtg', 'std_rtg'
        and 'num_rtg' columns.
    season : unused; kept so the signature matches the other attach_* helpers.

    Returns
    -------
    DataFrame with the columns of `df` followed by mean/std/num rating
    columns suffixed '_1' (Team1) and '_2' (Team2). Both joins are inner
    joins, so rows whose team has no rating entry are dropped.
    """
    stat_names = ('mean_rtg', 'std_rtg', 'num_rtg')
    wanted = list(df.columns)
    merged = df
    for suffix, team_col in (('_1', 'Team1'), ('_2', 'Team2')):
        renamed = {name: name + suffix for name in stat_names}
        wanted += [renamed[name] for name in stat_names]
        merged = merged.merge(
            ratings_eos.rename(columns = renamed),
            left_on = ['Season', team_col],
            right_on = ['season', 'team'],
        )
    return merged[wanted]
def get_eos_ratings(ratings):
    """Summarise each team's ordinal rank on the last rating day of every season.

    Parameters
    ----------
    ratings : DataFrame with 'season', 'rating_day_num', 'team' and 'orank'
        columns (one row per rating entry; other columns are ignored).

    Returns
    -------
    DataFrame with one row per (season, team) and the columns
    'season', 'team', 'mean_rtg', 'std_rtg', 'num_rtg', computed over all
    'orank' entries published on that season's final rating day.
    'std_rtg' is the sample standard deviation (ddof=1) and is NaN when a
    team has a single entry.
    """
    # Latest rating day per season; only that one column is aggregated so
    # unrelated (possibly non-numeric) columns never enter the reduction.
    last_day = ratings.groupby('season')['rating_day_num'].max().reset_index()
    # Inner join keeps just the rows published on each season's final day.
    eos_rows = last_day.merge(ratings, on = ['season', 'rating_day_num'])
    # String aggregation names pin pandas' own mean / sample-std / group size
    # instead of relying on how NumPy callables are translated by pandas.
    return eos_rows.groupby(['season', 'team'])['orank']\
        .agg(mean_rtg = 'mean', std_rtg = 'std', num_rtg = 'size')\
        .reset_index()
In [20]:
def get_score_fluctuation(reg_season, season):
    """Per-team spread of possession-scaled, home-corrected scoring margins.

    Quick and dirty by design: a more careful home/away treatment would only
    change the standard-deviation estimate at second order, and the units do
    not matter because the result is used in a ratio and normalised later.

    Parameters
    ----------
    reg_season : DataFrame of game results with 'Season', 'Wteam', 'Lteam',
        'Wscore', 'Lscore', 'Wloc' and the box-score columns
        'Wfga', 'Wor', 'Wto', 'Wfta', 'Lfga', 'Lor', 'Lto', 'Lfta'.
    season : season value used to filter `reg_season`.

    Returns
    -------
    DataFrame with columns 'team' and 'std_mgn': the sample standard
    deviation (ddof=1) of each team's margins per 100 possessions
    (positive for wins, negative for losses).
    """
    rsc = reg_season[reg_season['Season'] == season]
    winner_home = rsc['Wloc'] == 'H'
    winner_away = rsc['Wloc'] == 'A'

    # Average home-court edge: mean points scored at home minus mean points
    # scored away (neutral-site games contribute to neither side).
    home_scores = np.concatenate([rsc.loc[winner_home, 'Wscore'].values,
                                  rsc.loc[winner_away, 'Lscore'].values])
    away_scores = np.concatenate([rsc.loc[winner_away, 'Wscore'].values,
                                  rsc.loc[winner_home, 'Lscore'].values])
    home_correction = np.mean(home_scores) - np.mean(away_scores)

    # Possessions per game, averaged over the two teams.
    possessions = 0.5 * (
        rsc['Lfga'] - rsc['Lor'] + rsc['Lto'] + 0.475 * rsc['Lfta'] +
        rsc['Wfga'] - rsc['Wor'] + rsc['Wto'] + 0.475 * rsc['Wfta']
    )

    # Remove the home edge from the winner's margin (subtract when the winner
    # was at home, add when the winner was away, leave neutral games alone).
    location_adjustment = np.where(winner_home, -home_correction,
                                   np.where(winner_away, home_correction, 0.0))
    margin = (rsc['Wscore'] - rsc['Lscore']) + location_adjustment
    margin_per_100 = margin * 100 / possessions  # score per 100 possessions

    # One row per team per game: +margin for the winner, -margin for the loser.
    per_team = pd.concat([
        pd.DataFrame({'team': rsc['Wteam'], 'mgn': margin_per_100}),
        pd.DataFrame({'team': rsc['Lteam'], 'mgn': -margin_per_100}),
    ])
    # .std() pins the sample standard deviation explicitly rather than relying
    # on how pandas translates a NumPy callable passed to aggregate().
    return per_team.groupby('team')['mgn'].std().rename('std_mgn').reset_index()
def attach_score_fluctuations(df, reg_season, season):
    """Attach each team's margin spread ('std_mgn_1', 'std_mgn_2') to `df`.

    The spread comes from get_score_fluctuation(reg_season, season) and is
    inner-joined on 'Team1' and then on 'Team2', so rows whose team has no
    entry are dropped. Returns the columns of `df` plus the two new columns.
    """
    fluct = get_score_fluctuation(reg_season, season)
    wanted = list(df.columns) + ['std_mgn_1', 'std_mgn_2']
    merged = df
    for idx in ('1', '2'):
        side = fluct.rename(columns = {'std_mgn': 'std_mgn_' + idx})
        merged = merged.merge(side, left_on = 'Team' + idx, right_on = 'team')
    return merged[wanted]
In [5]:
def attach_kenpom_stats(df, kenpom, season):
    """Attach the 'AdjEM' and 'AdjTempo' values of both teams for one season.

    Parameters
    ----------
    df : DataFrame with 'Team1' and 'Team2' columns.
    kenpom : DataFrame with 'Season', 'Team_Id', 'AdjEM' and 'AdjTempo'.
    season : season value used to filter `kenpom`.

    Returns
    -------
    The columns of `df` followed by 'adjem_1', 'adjem_2', 'adjt_1', 'adjt_2'.
    Both joins are inner joins, so rows without a matching team are dropped.
    """
    wanted = list(df.columns) + ['adjem_1', 'adjem_2', 'adjt_1', 'adjt_2']
    in_season = kenpom['Season'] == season
    season_stats = kenpom[in_season][['Team_Id', 'AdjEM', 'AdjTempo']]
    merged = df
    for idx in ('1', '2'):
        side = season_stats.rename(columns = {'AdjEM': 'adjem_' + idx, 'AdjTempo': 'adjt_' + idx})
        merged = merged.merge(side, left_on = 'Team' + idx, right_on = 'Team_Id')
    return merged[wanted]
In [6]:
def get_root_and_leaves(hierarchy):
    """Find the root slot and the leaf entries of a slot hierarchy.

    `hierarchy` has a 'Slot' column (parent) and 'Strongseed' / 'Weakseed'
    columns (its two children). The root is a slot that is never a child;
    the leaves are children that are never a slot. Returns (root, leaves),
    where `leaves` is a list in set-iteration order. Raises IndexError when
    no root exists.
    """
    child_ids = set(hierarchy[['Strongseed', 'Weakseed']].values.ravel())
    parent_ids = set(hierarchy['Slot'].values)
    roots = []
    for slot in parent_ids:
        if slot not in child_ids:
            roots.append(slot)
    leaves = []
    for child in child_ids:
        if child not in parent_ids:
            leaves.append(child)
    return roots[0], leaves
def get_tourney_tree_one_season(tourney_slots, season):
    """Build the bracket tree of one season as a child -> info mapping.

    Returns a dict keyed by every 'Strongseed' / 'Weakseed' value of the
    season; each value is {'parent': slot, 'depth': n}, where depth counts
    the steps up to the root slot. The root itself is not a key. Depths are
    filled by walking up from every leaf, so an entry that is not on any
    leaf-to-root path keeps depth -1.
    """
    def depth_of(tree, node, root):
        # The root anchors the recursion; computed depths are cached in `tree`.
        if node == root:
            return 0
        entry = tree[node]
        if entry['depth'] < 0:
            entry['depth'] = 1 + depth_of(tree, entry['parent'], root)
        return entry['depth']

    in_season = tourney_slots['Season'] == season
    hierarchy = tourney_slots[in_season][['Slot', 'Strongseed', 'Weakseed']]
    root, leaves = get_root_and_leaves(hierarchy) # should be R6CH...
    # Weak-seed entries are applied second, so they win on a duplicate key.
    parent_of = dict(zip(hierarchy['Strongseed'], hierarchy['Slot']))
    parent_of.update(zip(hierarchy['Weakseed'], hierarchy['Slot']))
    tree = {}
    for child, parent in parent_of.items():
        tree[child] = {'parent': parent, 'depth': -1}
    for leaf in leaves:
        depth_of(tree, leaf, root)
    return tree
def get_tourney_trees(tourney_slots):
    """Return {season: bracket tree} for every season in `tourney_slots`."""
    trees = {}
    for season in tourney_slots['Season'].unique():
        trees[season] = get_tourney_tree_one_season(tourney_slots, season)
    return trees
def slot_matchup_from_seed(tree, seed1, seed2):
    """Return the slot in which the two seeds would face off.

    `tree` maps each node to {'parent': ..., 'depth': ...}. Both nodes are
    walked towards the root -- only the deeper one moves while the depths
    differ, both move once they are level -- until they coincide.
    """
    node_a, node_b = seed1, seed2
    while not (node_a == node_b):
        depth_a = tree[node_a]['depth']
        depth_b = tree[node_b]['depth']
        # A node stays put only while it is strictly shallower than the other.
        step_a = node_a if depth_a < depth_b else tree[node_a]['parent']
        step_b = node_b if depth_b < depth_a else tree[node_b]['parent']
        node_a, node_b = step_a, step_b
    return node_a
def get_team_seed(tourney_seeds, season, team):
    """Return the 'Seed' of `team` in `season`, or None.

    None is returned unless exactly one row of `tourney_seeds` matches both
    the 'Team' and the 'Season' value.
    """
    is_match = (tourney_seeds['Team'] == team) & (tourney_seeds['Season'] == season)
    seeds = tourney_seeds[is_match]['Seed'].values
    return seeds[0] if len(seeds) == 1 else None
In [7]:
def dist(play_lat, play_lng, lat, lng):
    """Great-circle distance in miles between the venue and (lat, lng)."""
    venue = (play_lat, play_lng)
    origin = (lat, lng)
    return geodist(venue, origin).miles
def reg_distance_to_game(games_in, team_geog):
    """Add travel distances ('w_dist', 'l_dist', in miles) to regular-season games.

    Parameters
    ----------
    games_in : DataFrame with 'Wteam', 'Lteam' and 'Wloc' columns; not modified.
    team_geog : DataFrame with 'team_id', 'lat' and 'lng' columns.

    Returns
    -------
    The columns of `games_in` plus 'w_dist' and 'l_dist'. The game is placed
    at the winner's location when 'Wloc' is 'H' and at the loser's location
    otherwise; for neutral games ('N') both teams are then given the average
    of the two distances. Teams are inner-joined to `team_geog`, so games with
    an unknown team are dropped.
    """
    wanted = list(games_in.columns) + ['w_dist', 'l_dist']
    games = games_in.copy()
    for side, team_col in (('w', 'Wteam'), ('l', 'Lteam')):
        geog = team_geog.rename(columns = {'lat': side + '_lat', 'lng': side + '_lng'})
        games = games.merge(geog, left_on = team_col, right_on = 'team_id')

    # handle neutral locations later by averaging distance from home for 2 teams if neutral location
    winner_at_home = games['Wloc'] == 'H'
    games['play_lat'] = np.where(winner_at_home, games['w_lat'], games['l_lat'])
    games['play_lng'] = np.where(winner_at_home, games['w_lng'], games['l_lng'])

    def miles_from_home(side):
        # Row-wise distance between the venue and the home of the given side.
        return games.apply(
            lambda g: dist(g['play_lat'], g['play_lng'], g[side + '_lat'], g[side + '_lng']),
            axis = 1,
        )

    games['w_dist'] = miles_from_home('w')
    games['l_dist'] = miles_from_home('l')

    # correct for neutral: the average is computed once, before either column is overwritten
    neutral = games['Wloc'] == 'N'
    average = (games['w_dist'] + games['l_dist']) / 2
    games['w_dist'] = np.where(neutral, average, games['w_dist'])
    games['l_dist'] = np.where(neutral, average, games['l_dist'])
    return games[wanted]
def tourney_distance_to_game(tourney_raw_in, tourney_geog, team_geog, season):
    """Add each team's distance (miles) to the venue of a tournament matchup.

    Parameters
    ----------
    tourney_raw_in : DataFrame with 'Team1', 'Team2' and 'SlotMatchup'; not modified.
    tourney_geog : DataFrame with 'season', 'slot', 'lat' and 'lng' (venue per slot).
    team_geog : DataFrame with 'team_id', 'lat' and 'lng'.
    season : season value used to filter `tourney_geog`.

    Returns
    -------
    The columns of `tourney_raw_in` plus 'dist_1' and 'dist_2'. All joins are
    inner joins, so rows without a team location or venue are dropped.
    """
    wanted = list(tourney_raw_in.columns) + ['dist_1', 'dist_2']
    in_season = tourney_geog['season'] == season
    venues = tourney_geog[in_season][['slot', 'lat', 'lng']]\
        .rename(columns = {'lat': 'lat_p', 'lng': 'lng_p'})

    merged = tourney_raw_in.copy()
    for idx in ('1', '2'):
        geog = team_geog.rename(columns = {'lat': 'lat_' + idx, 'lng': 'lng_' + idx})
        merged = merged.merge(geog, left_on = 'Team' + idx, right_on = 'team_id')
    merged = merged.merge(venues, left_on = 'SlotMatchup', right_on = 'slot')

    for idx in ('1', '2'):
        merged['dist_' + idx] = merged.apply(
            lambda g, i = idx: dist(g['lat_p'], g['lng_p'], g['lat_' + i], g['lng_' + i]),
            axis = 1,
        )
    return merged[wanted]
In [8]:
def get_raw_reg_season_data(reg_season, team_geog, season):
    """Regular-season games of one season in Team1/Team2 (low id / high id) form.

    Returns a DataFrame with 'Season', 'Daynum', 'Team1', 'Team2',
    'score_1', 'score_2', 'dist_1', 'dist_2', where Team1 is the smaller and
    Team2 the larger of the two team ids and the score/distance columns are
    re-ordered to match.
    """
    season_games = reg_season[reg_season['Season'] == season] # reg season raw
    rsr = reg_distance_to_game(season_games, team_geog)
    winner_is_low = rsr['Wteam'] < rsr['Lteam']
    winner_is_high = rsr['Wteam'] > rsr['Lteam']
    # (output stem, winner column, loser column)
    for stem, w_col, l_col in (('Team', 'Wteam', 'Lteam'),
                               ('score_', 'Wscore', 'Lscore'),
                               ('dist_', 'w_dist', 'l_dist')):
        rsr[stem + '1'] = np.where(winner_is_low, rsr[w_col], rsr[l_col])
        rsr[stem + '2'] = np.where(winner_is_high, rsr[w_col], rsr[l_col])
    return rsr[['Season', 'Daynum', 'Team1', 'Team2', 'score_1', 'score_2', 'dist_1', 'dist_2']]
def get_raw_tourney_data(tourney_seeds, tourney_trees, tourney_geog, team_geog, season):
    """Every possible pairing of the season's seeded teams, with venue distances.

    Returns one row per unordered team pair (Team1 < Team2) with 'Team1',
    'Team2', 'Season', 'SlotMatchup' (the slot where the two would meet) and
    the 'dist_1' / 'dist_2' columns added by tourney_distance_to_game.
    """
    # tree to find play location
    tree = tourney_trees[season]

    # get all teams in tourney
    season_seeds = tourney_seeds[tourney_seeds['Season'] == season]
    seed_map = season_seeds.set_index('Team').to_dict()['Seed']
    teams = sorted(seed_map.keys())
    team_pairs = []
    for pos, low in enumerate(teams):
        for high in teams[pos + 1:]:
            team_pairs.append((low, high))
    tourney_raw = pd.DataFrame(team_pairs).rename(columns = { 0 : 'Team1', 1 : 'Team2' })
    tourney_raw['Season'] = season

    # find out where they would play each other
    def meeting_slot(pair):
        return slot_matchup_from_seed(tree, seed_map[pair['Team1']], seed_map[pair['Team2']])
    tourney_raw['SlotMatchup'] = tourney_raw.apply(meeting_slot, axis = 1)

    # get features
    return tourney_distance_to_game(tourney_raw, tourney_geog, team_geog, season)
def attach_supplements(data, reg_season, kenpom, ratings_eos, season):
    """Attach rating, KenPom and score-fluctuation columns to a copy of `data`."""
    with_ratings = attach_ratings_diff_stats(data.copy(), ratings_eos, season) # get ratings diff stats
    with_kenpom = attach_kenpom_stats(with_ratings, kenpom, season)
    return attach_score_fluctuations(with_kenpom, reg_season, season)
In [9]:
def generate_features(df):
    """Derive the model features (and the target, when scores exist) from `df`.

    `df` must provide 'Team1', 'Team2', 'Season', 'dist_1/2', 'mean_rtg_1/2',
    'std_rtg_1/2', 'adjem_1/2' and 'std_mgn_1/2'. The result has the columns
    'Team1', 'Team2', 'Season', 'ln_dist_diff', 'rtg_diff', 't_rtg',
    'pt_diff', 't_score', plus the boolean 'Team1_win' when both 'score_1'
    and 'score_2' are present.
    """
    has_score = 'score_1' in df.columns and 'score_2' in df.columns
    keep = ['Team1', 'Team2', 'Season', 'ln_dist_diff', 'rtg_diff', 't_rtg', 'pt_diff', 't_score']
    features = df.copy()

    # Sign is flipped so that the better (lower-ranked) team gets the higher statistic.
    rating_gap = -(df['mean_rtg_1'] - df['mean_rtg_2'])
    rating_noise = np.sqrt(df['std_rtg_1']**2 + df['std_rtg_2']**2)
    margin_gap = df['adjem_1'] - df['adjem_2']
    margin_noise = np.sqrt(df['std_mgn_1']**2 + df['std_mgn_2']**2)

    features['ln_dist_diff'] = np.log((1 + df['dist_1'])/(1 + df['dist_2']))
    features['rtg_diff'] = rating_gap
    features['t_rtg'] = rating_gap / rating_noise
    features['pt_diff'] = margin_gap
    features['t_score'] = margin_gap / margin_noise

    # truth feature: did team 1 win?
    if has_score:
        features['Team1_win'] = features['score_1'] > features['score_2']
        keep = keep + ['Team1_win']
    return features[keep]
def normalize_features(train, test, features):
    """Scale the `features` columns of train and test with one shared transform.

    The feature rows of both frames are stacked and passed to skpp.scale
    together, so the scaling statistics come from train and test jointly.
    Returns copies (train_norm, test_norm); the inputs are not modified.
    """
    n_train = len(train)
    stacked = pd.concat([train[features], test[features]])
    scaled = skpp.scale(stacked) # with_mean = False ?
    train_norm, test_norm = train.copy(), test.copy()
    # The first n_train scaled rows belong to train, the remainder to test.
    train_norm[features] = scaled[:n_train]
    test_norm[features] = scaled[n_train:]
    return train_norm, test_norm
In [10]:
def get_key(df):
    """Build the 'Season_Team1_Team2' string key for every row of `df`."""
    parts = [df[col].map(str) for col in ('Season', 'Team1', 'Team2')]
    key = parts[0]
    for part in parts[1:]:
        key = key + '_' + part
    return key
In [21]:
# Feature columns (as produced by generate_features) fed to the classifier;
# get_features normalises exactly these columns.
features_to_use = ['ln_dist_diff', 'rtg_diff', 't_rtg', 'pt_diff', 't_score']
# Target column: True when Team1 outscored Team2 (set in generate_features).
predict_field = 'Team1_win'
def get_features(season, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog):
    """Build raw, feature and normalised frames for one season.

    Training data are the season's regular-season games; test data are all
    possible pairings of that season's seeded teams. Returns the tuple
    (regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm).
    """
    # support data
    ratings_eos = get_eos_ratings(ratings)
    tourney_trees = get_tourney_trees(tourney_slots)

    # regular season cleaned data
    regular_raw = attach_supplements(
        get_raw_reg_season_data(reg_season, team_geog, season),
        reg_season, kenpom, ratings_eos, season,
    )

    # post season cleaned data
    tourney_raw = attach_supplements(
        get_raw_tourney_data(tourney_seeds, tourney_trees, tourney_geog, team_geog, season),
        reg_season, kenpom, ratings_eos, season,
    )

    # get and normalize features
    feat_train = generate_features(regular_raw)
    feat_test = generate_features(tourney_raw)
    train_norm, test_norm = normalize_features(feat_train, feat_test, features_to_use)
    return regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm
def make_predictions(season, train_norm, test_norm, tourney, C = 1):
    """Fit a logistic regression and score it on the season's tournament games.

    Parameters
    ----------
    season : season whose tournament results are used for evaluation.
    train_norm, test_norm : normalised feature frames (see normalize_features).
    tourney : tournament results with 'Season', 'Daynum', 'Wteam', 'Lteam'.
    C : inverse regularisation strength passed to LogisticRegression.

    Returns
    -------
    (predictions, res, ll): `predictions` has 'Id' and 'Pred' (probability
    that Team1 wins) for every test row; `res` holds 'Id', 'Result' and
    'Pred' for the games actually played with Daynum > 135 (presumably
    excluding play-in games -- TODO confirm); `ll` is their log loss.
    """
    # fit
    model = sklm.LogisticRegression(C = C) # fit_intercept = False???
    model.fit(train_norm[features_to_use].values, train_norm[predict_field].values)

    # predictions: column 1 of predict_proba is the second class
    win_prob = model.predict_proba(test_norm[features_to_use].values)[:, 1]
    predictions = pd.DataFrame({'Id' : get_key(test_norm).values, 'Pred' : win_prob})

    # Evaluate outcomes
    played = (tourney['Season'] == season) & (tourney['Daynum'] > 135)
    res_base = tourney[played].copy().reset_index()
    winner_is_low = res_base['Wteam'] < res_base['Lteam']
    winner_is_high = res_base['Wteam'] > res_base['Lteam']
    res_base['Team1'] = np.where(winner_is_low, res_base['Wteam'], res_base['Lteam'])
    res_base['Team2'] = np.where(winner_is_high, res_base['Wteam'], res_base['Lteam'])
    res_base['Result'] = (res_base['Wteam'] == res_base['Team1']).astype(int)
    res_base['Id'] = get_key(res_base)

    # attach results to predictions
    res = res_base[['Id', 'Result']].merge(predictions, on = 'Id', how = 'left')

    # logloss
    ll = skm.log_loss(res['Result'], res['Pred'])
    return predictions, res, ll
In [28]:
# Fit and evaluate one model per season; the per-season log loss is printed
# and the prediction frames are collected for the submission file.
all_predictions = []
for season in (2013, 2014, 2015, 2016):
    regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm = get_features(
        season, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog
    )
    # see below for choice of C
    predictions, res, ll = make_predictions(season, train_norm, test_norm, tourney, C = 5e-3)
    print(ll)
    all_predictions.append(predictions)
In [ ]:
# 0.559078513104 -- 2013
# 0.541984791608 -- 2014
# 0.480356337664 -- 2015
# 0.511671826092 -- 2016
In [477]:
# Write the concatenated per-season predictions ('Id', 'Pred') to one CSV file.
# NOTE(review): the ./submissions directory is assumed to exist already;
# to_csv does not create missing directories.
pd.concat(all_predictions).to_csv('./submissions/simpleLogisticModel2013to2016_tuned.csv', index = False)
In [632]:
# Pair plot of the normalised training features, coloured by the outcome.
# `predict_field` / `features_to_use` are the names defined alongside
# get_features; the previous `hue = predict` referred to a name that is not
# defined anywhere in the notebook and failed on a fresh kernel.
sns.pairplot(train_norm, hue = predict_field, vars = features_to_use)
plt.show()
In [309]:
# Look up the two teams of the matchup inspected below.
teams.loc[teams['Team_Id'].isin([1163, 1196])]
Out[309]:
In [310]:
# Raw (pre-feature) row of the 1163 vs 1196 pairing.
tourney_raw.loc[(tourney_raw['Team1'] == 1163) & (tourney_raw['Team2'] == 1196)]
Out[310]:
In [311]:
# Feature row of one candidate pairing.
# NOTE(review): the two neighbouring lookup cells use teams 1163 and 1196,
# but this filter uses 1195 -- confirm whether 1195 is a typo for 1163.
feat_test[(feat_test['Team1'] == 1195) & (feat_test['Team2'] == 1196)]
Out[311]:
In [571]:
# Games ordered by absolute prediction error, worst first.
# np.argsort yields row positions, so positional .iloc is the right indexer
# (.ix was removed from pandas).
abs_error = (res['Pred'] - res['Result']).abs()
res.iloc[np.argsort(-abs_error.values)].reset_index(drop = True)
Out[571]:
In [287]:
# accuracy? -- share of games where the side given > 0.5 probability actually won
np.mean(np.where(res['Pred'] > 0.5, res['Result'] == 1, res['Result'] == 0))
Out[287]:
In [23]:
# Sweep the regularisation strength C over a log-spaced grid and record the
# tournament log loss for every season; one column per season in `c_effect`.
cs_to_check = 10.0 ** np.arange(-4, 2, 0.1)
years_to_check = range(2011, 2017)
c_effect_df_dict = dict(C = cs_to_check)
for yr in years_to_check:
    regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm = get_features(
        yr, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog
    )
    # make_predictions returns (predictions, res, ll); only the log loss is kept
    log_losses = [ make_predictions(yr, train_norm, test_norm, tourney, C = C)[2] for C in cs_to_check ]
    c_effect_df_dict[str(yr)] = log_losses
c_effect = pd.DataFrame(c_effect_df_dict)
In [24]:
# Log loss as a function of C, one line per season (log-scaled x axis).
plt.semilogx()
for col in [ col for col in c_effect if col != 'C' ]:
    # Explicit label: the legend must not depend on matplotlib picking up the
    # Series name automatically, which varies between versions.
    plt.plot(c_effect['C'], c_effect[col], label = col)
plt.legend(loc = 3)
plt.xlabel('C')
plt.ylabel('logloss')
plt.ylim(0.45, 0.65)
plt.show()
In [25]:
# contribution to logloss: each game's share of the mean log loss
rc = res.copy()
ftc = feat_test.copy()
ftc['Id'] = get_key(ftc)
rc['logloss_contrib'] = -np.log(np.where(rc['Result'] == 1, rc['Pred'], 1 - rc['Pred'])) / len(rc)
ftc = pd.merge(rc, ftc, how = 'left', on = 'Id')

fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (10, 4))
# The colormap is passed by name, which every matplotlib version accepts
# (plt.cm.get_cmap is no longer available in recent releases).
scatter_style = dict(c = ftc['logloss_contrib'], vmin = 0, vmax = 0.025, cmap = 'coolwarm')
im = axes[0].scatter(ftc['t_score'], ftc['t_rtg'], **scatter_style)
axes[0].set_xlabel('t_score')
axes[0].set_ylabel('t_rtg')
# The distance feature is plotted negated, so the axis label says so.
axes[1].scatter(-ftc['ln_dist_diff'], ftc['t_rtg'], **scatter_style)
axes[1].set_xlabel('-ln_dist_diff')
axes[1].set_ylabel('t_rtg')
fig.colorbar(im, ax=axes.ravel().tolist(), label = 'logloss_contrib')
plt.show()
In [26]:
# Log-loss contribution per round: mean (left panel) and maximum (right panel).
# The round is read from the second character of the slot name.
tourney_rounds = pd.DataFrame({
    'Id': get_key(tourney_raw),
    'round': tourney_raw['SlotMatchup'].map(lambda slot: int(slot[1])),
})
ftc_with_rounds = ftc.merge(tourney_rounds, how = 'left', on = 'Id')
fig, axs = plt.subplots(ncols=2, figsize = (10, 4))
for ax, extra in zip(axs, ({}, {'estimator': max})):
    sns.barplot(data = ftc_with_rounds, x = 'round', y = 'logloss_contrib', errwidth = 0, ax = ax, **extra)
    ax.set_ylim(0, 0.035)
plt.show()
In [398]:
# Mean 'Numot' per regular-season game, by season (seasons after 2000 only).
sns.barplot(x = 'Season', y = 'Numot', data = reg_season.loc[reg_season['Season'] > 2000], errwidth = 0)
plt.show()
In [321]:
# Spread of the end-of-season ratings against their mean, one point per
# (season, team). `ratings_eos` is built here because it otherwise only exists
# as a local inside get_features; x / y are passed by keyword, which every
# seaborn version accepts.
ratings_eos = get_eos_ratings(ratings)
sns.lmplot(x = 'mean_rtg', y = 'std_rtg', data = ratings_eos, fit_reg = False)
plt.show()
In [353]:
# Check whether the rating spread follows a parabola in the mean rating:
# (max/2)^2 - (mean - max/2)^2 is zero at both ends and peaks mid-table.
# The ratings are rebuilt here (a fresh frame, safe to extend) because no
# module-level `ratings_eos` is defined by the cells that build the model.
ratings_eos_test = get_eos_ratings(ratings)
half_max = ratings_eos_test['mean_rtg'].max()/2
ratings_eos_test['parabola_mean_model'] = half_max**2 - (ratings_eos_test['mean_rtg'] - half_max)**2
sns.lmplot(x = 'parabola_mean_model', y = 'std_rtg', data = ratings_eos_test, fit_reg = False)
plt.show()
In [352]:
# Compare the raw rating difference with its t-like normalised version.
# NOTE(review): this cell used an undefined name `test_data`; `tourney_raw`
# (the test-set frame returned by get_features) carries the mean_rtg_* and
# std_rtg_* columns used here -- confirm it is the intended input.
test_data_test = tourney_raw.copy()
test_data_test['rtg_diff'] = test_data_test['mean_rtg_1'] - test_data_test['mean_rtg_2']
test_data_test['t_model'] = test_data_test['rtg_diff']/(test_data_test['std_rtg_1']**2 + test_data_test['std_rtg_2']**2)**0.5
sns.pairplot(test_data_test[['rtg_diff', 't_model']])
plt.show()
In [247]:
# One (distance travelled, points scored) row per team per 2016 game.
# reg_distance_to_game is the helper that yields the w_dist / l_dist columns
# next to Wscore / Lscore; the previously used `get_training_data` is not
# defined anywhere in the notebook.
games_2016 = reg_distance_to_game(reg_season[reg_season['Season'] == 2016], team_geog)
w_dist_test = games_2016[['w_dist', 'Wscore']].rename(columns = {'w_dist' : 'dist', 'Wscore' : 'score'})
l_dist_test = games_2016[['l_dist', 'Lscore']].rename(columns = {'l_dist' : 'dist', 'Lscore' : 'score'})
dist_test = pd.concat([w_dist_test, l_dist_test]).reset_index(drop = True)
In [270]:
# Histogram of the travel distances in dist_test, with a log-scaled count axis.
plt.hist(dist_test['dist'])
# Limit the x axis to 0-3000 (distances are in miles, see dist()).
plt.xlim(0, 3000)
# Called without data: only switches the y axis to a log scale.
plt.semilogy()
plt.show()
In [324]:
# Bucket games by the floor of log(distance + 1) and summarise the score per
# bucket: mean, sample standard deviation, count and standard error.
bucket_size = 1
dist_test['bucket'] = bucket_size * (np.log(dist_test['dist'] + 1) // bucket_size)
# Only 'score' is aggregated, and the statistics are named by string so the
# mean / sample std / group size are pandas' own implementations.
dist_grp = dist_test.groupby('bucket')['score'].agg(mean = 'mean', std = 'std', len = 'size')
dist_grp['err'] = dist_grp['std'] / np.sqrt(dist_grp['len'])
In [325]:
# Mean score per log-distance bucket with a +/- 2 standard-error band.
band_low = (dist_grp['mean'] - 2*dist_grp['err']).values
band_high = (dist_grp['mean'] + 2*dist_grp['err']).values
plt.plot(dist_grp['mean'])
plt.fill_between(dist_grp.index, band_low, band_high, alpha = 0.3)
plt.xlabel('log of distance traveled')
plt.ylabel('avg score')
plt.show()