In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt
from geopy.distance import great_circle as geodist
import sklearn.linear_model as sklm
import sklearn.preprocessing as skpp
import sklearn.metrics as skm
import sklearn.model_selection as skms
In [2]:
# Load the raw competition files plus the supplementary ("addl") data sets.
reg_season = pd.read_csv('data/RegularSeasonDetailedResults.csv')
tourney = pd.read_csv('data/TourneyDetailedResults.csv')
team_geog = pd.read_csv('data/addl/TeamGeog.csv')
tourney_geog = pd.read_csv('data/addl/TourneyGeog.csv')
tourney_slots = pd.read_csv('data/TourneySlots.csv')
tourney_seeds = pd.read_csv('data/TourneySeeds.csv')
kenpom = pd.read_csv('data/kenPomTeamData.csv')
teams = pd.read_csv('data/Teams.csv')
spreads = pd.read_csv('data/addl/point_spreads.csv')
ratings_pre2017 = pd.read_csv('data/addl/massey_ordinals_2003-2016.csv')
ratings_2017 = pd.read_csv('data/addl/MasseyOrdinals_2017_ThruDay133_68systems.csv')
# ignore_index gives the combined frame a fresh unique index; without it both
# halves keep their own 0..n labels and the result has duplicate index values.
ratings = pd.concat([ratings_pre2017, ratings_2017], ignore_index = True)
Use published rankings, together with the distance each team travels to the game, to classify winners and losers.
Train on the regular season and test on the postseason.
TWEAKS TO TRY
considerations:
In [3]:
def normalize_ratings(some_rater):
    """Attach an 'eff_rtg' column that maps ordinal ranks onto a normal scale.

    Every rank is converted to a within-season percentile,
    ``1 - int(orank) / (largest orank of that season + 1)``, and passed through
    the standard normal inverse CDF, so the best rank gets the largest value.

    Parameters
    ----------
    some_rater : pandas.DataFrame
        Must contain 'season' and 'orank' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with an additional float column 'eff_rtg'.
    """
    sr = some_rater.copy()
    # Largest rank of each season, broadcast back onto every row of that season.
    max_rank = sr.groupby('season')['orank'].transform('max')
    # One vectorised ppf call replaces the per-row apply and its manual cache.
    percentile = 1 - sr['orank'].astype(int) / (max_rank + 1)
    sr['eff_rtg'] = ss.norm.ppf(percentile)
    return sr
In [4]:
def attach_ratings_diff_stats(df, ratings_eos, season):
    """Add end-of-season rating statistics for both teams of every matchup.

    ``ratings_eos`` is inner-joined twice on (season, team): once against
    'Team1' and once against 'Team2'. Its 'mean_rtg', 'std_rtg' and 'num_rtg'
    columns come back suffixed with '_1' / '_2'. Rows without a rating for
    either team are dropped by the inner joins. ``season`` is accepted but not
    used in the body.

    Returns the original columns of ``df`` followed by the six new ones.
    """
    stat_names = ['mean_rtg', 'std_rtg', 'num_rtg']
    wanted = list(df.columns)
    merged = df
    for side in ('1', '2'):
        per_side = ratings_eos.rename(columns = { name : name + '_' + side for name in stat_names })
        merged = merged.merge(per_side, left_on = ['Season', 'Team' + side], right_on = ['season', 'team'])
        wanted += [ name + '_' + side for name in stat_names ]
    return merged[wanted]
def get_eos_ratings(ratings):
    """Summarise each team's normalised rating on the last rating day of a season.

    For every season the largest 'rating_day_num' is located, only the ratings
    published on that day are kept, they are normalised with
    ``normalize_ratings`` and then aggregated per (season, team).

    Parameters
    ----------
    ratings : pandas.DataFrame
        Needs 'season', 'rating_day_num', 'team' and 'orank' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'season', 'team', 'mean_rtg', 'std_rtg' (sample standard
        deviation) and 'num_rtg' (number of ratings aggregated).
    """
    ratings_last_day = ratings.groupby('season')['rating_day_num'].max().reset_index()
    ratings_eos_all = ratings_last_day.merge(ratings, on = ['season', 'rating_day_num'])
    ratings_eos_all = normalize_ratings(ratings_eos_all)
    # Select the column before aggregating so unrelated (possibly non-numeric)
    # columns are never averaged, and use string aggregator names so the result
    # does not depend on how pandas treats numpy callables.
    ratings_eos = ratings_eos_all.groupby(['season', 'team'])['eff_rtg'].agg(['mean', 'std', 'size'])
    return ratings_eos.reset_index().rename(columns = {'mean' : 'mean_rtg', 'std' : 'std_rtg', 'size' : 'num_rtg'})
In [5]:
def attach_ratings_by_day_diff_stats(df, ratings_by_day, season):
    """Add day-of-game rating statistics for both teams of every game.

    ``ratings_by_day`` is inner-joined twice on (season, day, team): once
    against 'Team1' and once against 'Team2', matching 'Daynum' to
    'rating_day_num'. The 'mean_rtg', 'std_rtg' and 'num_rtg' columns come back
    suffixed with '_1' / '_2'. ``season`` is accepted but not used in the body.

    Returns the original columns of ``df`` followed by the six new ones.
    """
    stat_names = ['mean_rtg', 'std_rtg', 'num_rtg']
    wanted = list(df.columns)
    merged = df
    for side in ('1', '2'):
        per_side = ratings_by_day.rename(columns = { name : name + '_' + side for name in stat_names })
        merged = merged.merge(per_side,
                              left_on = ['Season', 'Daynum', 'Team' + side],
                              right_on = ['season', 'rating_day_num', 'team'])
        wanted += [ name + '_' + side for name in stat_names ]
    return merged[wanted]
def get_ratings_by_day(ratings):
    """Build per-(season, day, team) rating statistics with gaps filled in.

    Ratings are normalised with ``normalize_ratings`` and aggregated per
    (season, rating day, team). The result is laid over a full grid of seasons
    2000..latest season, days 0..199 and every team seen in ``ratings``. Missing
    days are filled from the most recent earlier day of the same season/team
    and, where nothing earlier exists, from the next later day.

    Returns
    -------
    pandas.DataFrame
        Columns 'season', 'rating_day_num', 'team', 'mean_rtg', 'std_rtg',
        'num_rtg', sorted by season and day.
    """
    stat_cols = ['mean_rtg', 'std_rtg', 'num_rtg']
    rc = normalize_ratings(ratings)  # returns a copy, so `ratings` is left untouched
    # Aggregate only the normalised rating, with string aggregator names, so
    # unrelated columns are never touched and 'std' is the sample std.
    ratings_by_day_stats = rc.groupby(['season', 'rating_day_num', 'team'])['eff_rtg']\
        .agg(['mean', 'std', 'size'])\
        .rename(columns = {'mean' : 'mean_rtg', 'std' : 'std_rtg', 'size' : 'num_rtg'})\
        .reset_index()
    all_teams = rc['team'].unique()
    all_day_teams = pd.DataFrame(
        [ (year, day, team) for year in range(2000, rc['season'].max() + 1)
                            for day in range(200)
                            for team in all_teams ]
    ).rename(columns = {0 : 'season', 1 : 'rating_day_num', 2 : 'team'})
    ratings_by_day = pd.merge(all_day_teams, ratings_by_day_stats,
                              on = ['season', 'rating_day_num', 'team'],
                              how = 'left')\
        .sort_values(['season', 'rating_day_num'])
    # Fill only the statistic columns and write them back. This keeps the key
    # columns in place whether or not GroupBy.ffill/bfill returns them.
    ratings_by_day[stat_cols] = ratings_by_day.groupby(['season', 'team'])[stat_cols].ffill()  # drag forward
    ratings_by_day[stat_cols] = ratings_by_day.groupby(['season', 'team'])[stat_cols].bfill()  # then drag up
    return ratings_by_day
In [6]:
def get_score_fluctuation(reg_season, season):
    """Per-team standard deviation of possession-scaled victory margins.

    Quick and dirty: margins are corrected for the average home/away scoring
    gap of the season and scaled to points per 100 possessions. Units do not
    matter much because the result is used in a ratio and normalised later.

    Parameters
    ----------
    reg_season : pandas.DataFrame
        Detailed regular-season results ('Season', 'Wteam', 'Lteam', 'Wloc',
        'Wscore', 'Lscore' and the W*/L* 'fga', 'or', 'to', 'fta' box-score
        columns).
    season : int
        Season to evaluate.

    Returns
    -------
    pandas.DataFrame
        Columns 'team' and 'std_mgn' (sample standard deviation, ddof=1).
    """
    rsc = reg_season[reg_season['Season'] == season]
    home_win = rsc['Wloc'] == 'H'
    away_win = rsc['Wloc'] == 'A'
    # average score of home teams vs away teams
    hscores = rsc.loc[home_win, 'Wscore'].tolist() + rsc.loc[away_win, 'Lscore'].tolist()
    ascores = rsc.loc[away_win, 'Wscore'].tolist() + rsc.loc[home_win, 'Lscore'].tolist()
    home_correction = np.mean(hscores) - np.mean(ascores)
    # possessions per game, averaged over both teams
    possessions = 0.5 * (
        rsc['Lfga'] - rsc['Lor'] + rsc['Lto'] + 0.475*rsc['Lfta'] +
        rsc['Wfga'] - rsc['Wor'] + rsc['Wto'] + 0.475*rsc['Wfta']
    )
    # victory margin with the home edge removed from home wins and added back to away wins
    win_mgn = (rsc['Wscore'] - rsc['Lscore'])\
        + np.where(home_win, -home_correction, 0)\
        + np.where(away_win, home_correction, 0)
    win_mgn_scaled = win_mgn * 100 / possessions  # margin per 100 possessions
    # every game contributes +margin to the winner and -margin to the loser
    win_mgns = pd.concat([
        pd.DataFrame({'team' : rsc['Wteam'].values, 'mgn' : win_mgn_scaled.values}),
        pd.DataFrame({'team' : rsc['Lteam'].values, 'mgn' : -win_mgn_scaled.values}),
    ])
    # .std() pins the sample standard deviation explicitly rather than relying
    # on how pandas interprets a numpy callable passed to aggregate().
    return win_mgns.groupby('team')['mgn'].std().rename('std_mgn').reset_index()
def attach_score_fluctuations(df, reg_season, season):
    """Add each team's margin volatility as 'std_mgn_1' and 'std_mgn_2'.

    The per-team table from ``get_score_fluctuation`` is inner-joined against
    'Team1' and then 'Team2'. The original columns of ``df`` are returned,
    followed by the two new ones.
    """
    fluct = get_score_fluctuation(reg_season, season)
    merged = df
    for side in ('1', '2'):
        per_side = fluct.rename(columns = {'std_mgn' : 'std_mgn_' + side})
        merged = merged.merge(per_side, left_on = 'Team' + side, right_on = 'team')
    return merged[list(df.columns) + ['std_mgn_1', 'std_mgn_2']]
In [7]:
def attach_kenpom_stats(df, kenpom, season):
    """Add efficiency margin and tempo for both teams of every matchup.

    The efficiency margin is 'AdjOE' minus 'AdjDE'. Only the rows of ``kenpom``
    for ``season`` are used, inner-joined on 'Team_Id' against 'Team1' and then
    'Team2'. Returns the original columns of ``df`` followed by 'adjem_1',
    'adjem_2', 'adjt_1' and 'adjt_2'.
    """
    season_rows = kenpom[kenpom['Season'] == season]
    stats = season_rows[['Team_Id', 'AdjTempo']]\
        .assign(AdjEM = season_rows['AdjOE'] - season_rows['AdjDE'])
    merged = df
    for side in ('1', '2'):
        per_side = stats.rename(columns = {'AdjEM' : 'adjem_' + side, 'AdjTempo' : 'adjt_' + side})
        merged = merged.merge(per_side, left_on = 'Team' + side, right_on = 'Team_Id')
    return merged[list(df.columns) + ['adjem_1', 'adjem_2', 'adjt_1', 'adjt_2']]
In [8]:
def get_root_and_leaves(hierarchy):
    """Find the root slot and the leaf seeds of a bracket.

    ``hierarchy`` has one row per slot with columns 'Slot', 'Strongseed' and
    'Weakseed'. The root is a slot that never appears as a strong/weak entry.
    The leaves are the strong/weak entries that are never a slot themselves.

    Returns ``(root, leaves)`` where ``leaves`` is a list.
    """
    parents = set(hierarchy[['Slot']].values.ravel())
    children = set(hierarchy[['Strongseed', 'Weakseed']].values.ravel())
    roots = [ slot for slot in parents if slot not in children ]
    leaves = [ entry for entry in children if entry not in parents ]
    return roots[0], leaves
def get_tourney_tree_one_season(tourney_slots, season):
    """Build the bracket of one season as a child -> {'parent', 'depth'} mapping.

    Every strong/weak entry of a slot becomes a key whose 'parent' is that slot.
    'depth' is the number of steps from the entry up to the root slot; entries
    not reachable from any leaf keep the initial depth of -1.
    """
    hierarchy = tourney_slots[tourney_slots['Season'] == season][['Slot', 'Strongseed', 'Weakseed']]
    root, leaves = get_root_and_leaves(hierarchy)  # should be R6CH...
    parent_of = dict(zip(hierarchy['Strongseed'], hierarchy['Slot']))
    parent_of.update(zip(hierarchy['Weakseed'], hierarchy['Slot']))
    tree = { child : {'parent' : parent, 'depth' : -1} for child, parent in parent_of.items() }
    for leaf in leaves:
        # Walk upwards until the root or an already-measured node is reached ...
        path = []
        node = leaf
        while node != root and tree[node]['depth'] < 0:
            path.append(node)
            node = tree[node]['parent']
        depth = 0 if node == root else tree[node]['depth']
        # ... then number the visited nodes on the way back down.
        for visited in reversed(path):
            depth += 1
            tree[visited]['depth'] = depth
    return tree
def get_tourney_trees(tourney_slots):
    """Return a dict mapping every season in ``tourney_slots`` to its bracket tree."""
    trees = {}
    for season in tourney_slots['Season'].unique():
        trees[season] = get_tourney_tree_one_season(tourney_slots, season)
    return trees
def slot_matchup_from_seed(tree, seed1, seed2):
    """Return the slot in which the two entries would face each other.

    Starting from both entries, whichever node is at least as deep as the other
    is moved to its parent (both move when the depths are equal) until the two
    paths meet. The meeting node is returned.
    """
    node1, node2 = seed1, seed2
    while node1 != node2:
        depth1, depth2 = tree[node1]['depth'], tree[node2]['depth']
        climbed1 = tree[node1]['parent'] if depth1 >= depth2 else node1
        climbed2 = tree[node2]['parent'] if depth2 >= depth1 else node2
        node1, node2 = climbed1, climbed2
    return node1
def get_team_seed(tourney_seeds, season, team):
    """Return the 'Seed' of ``team`` in ``season``.

    Returns None unless exactly one row of ``tourney_seeds`` matches.
    """
    is_match = (tourney_seeds['Team'] == team) & (tourney_seeds['Season'] == season)
    found = tourney_seeds[is_match]['Seed'].values
    return found[0] if len(found) == 1 else None
In [9]:
def dist(play_lat, play_lng, lat, lng):
    """Great-circle distance in miles between the venue and (lat, lng)."""
    venue = (play_lat, play_lng)
    origin = (lat, lng)
    return geodist(venue, origin).miles
def reg_distance_to_game(games_in, team_geog):
    """Add travel distances 'w_dist' / 'l_dist' (miles) for winner and loser.

    The venue is the winner's home when 'Wloc' is 'H' and the loser's home
    otherwise. For neutral games ('N') both teams get the average of the two
    distances computed that way. Games whose teams are missing from
    ``team_geog`` are dropped by the inner joins. ``games_in`` is not modified.
    """
    wanted = list(games_in.columns) + ['w_dist', 'l_dist']
    games = games_in.copy()
    for prefix, team_col in (('w', 'Wteam'), ('l', 'Lteam')):
        geog = team_geog.rename(columns = {'lat' : prefix + '_lat', 'lng' : prefix + '_lng'})
        games = games.merge(geog, left_on = team_col, right_on = 'team_id')
    # neutral sites are handled below by averaging the two teams' distances
    winner_hosts = games['Wloc'] == 'H'
    games['play_lat'] = np.where(winner_hosts, games['w_lat'], games['l_lat'])
    games['play_lng'] = np.where(winner_hosts, games['w_lng'], games['l_lng'])
    for prefix in ('w', 'l'):
        games[prefix + '_dist'] = games.apply(
            lambda x, p = prefix: dist(x['play_lat'], x['play_lng'], x[p + '_lat'], x[p + '_lng']),
            axis = 1
        )
    # correct for neutral: the average is computed once, before either column is overwritten
    is_neutral = games['Wloc'] == 'N'
    shared = (games['w_dist'] + games['l_dist']) / 2
    games['w_dist'] = np.where(is_neutral, shared, games['w_dist'])
    games['l_dist'] = np.where(is_neutral, shared, games['l_dist'])
    return games[wanted]
def tourney_distance_to_game(tourney_raw_in, tourney_geog, team_geog, season):
    """Add 'dist_1' / 'dist_2': miles from each team's home to the slot's venue.

    The venue of a matchup is looked up in ``tourney_geog`` by season and
    'SlotMatchup'. Team coordinates come from ``team_geog``. All joins are inner
    joins, so rows lacking any of the three locations are dropped.
    """
    wanted = list(tourney_raw_in.columns) + ['dist_1', 'dist_2']
    venues = tourney_geog[tourney_geog['season'] == season][['slot', 'lat', 'lng']]\
        .rename(columns = {'lat' : 'lat_p', 'lng' : 'lng_p'})
    merged = tourney_raw_in.copy()
    for side in ('1', '2'):
        geog = team_geog.rename(columns = {'lat' : 'lat_' + side, 'lng' : 'lng_' + side})
        merged = merged.merge(geog, left_on = 'Team' + side, right_on = 'team_id')
    merged = merged.merge(venues, left_on = 'SlotMatchup', right_on = 'slot')
    for side in ('1', '2'):
        merged['dist_' + side] = merged.apply(
            lambda x, s = side: dist(x['lat_p'], x['lng_p'], x['lat_' + s], x['lng_' + s]),
            axis = 1
        )
    return merged[wanted]
In [10]:
def attach_elom(regular_raw, tourney_raw, n_seed = 3, tau_seed = 70, tau_feature = 35):
    """Attach an Elo-style "momentum" rating ('elom_1' / 'elom_2') to both data sets.

    Regular season
      * Game margins are corrected for the average edge of the team that
        travelled less, then every game pulls both teams' ratings towards
        ``opponent rating +/- sign(margin) * log(1 + |margin|)`` with weight
        ``1 / tau``.
      * The season is replayed ``n_seed`` times with ``tau_seed`` to seed the
        ratings, then once more with ``tau_feature``. Only in that last pass is
        the rating held *before* each game recorded as the feature.

    Tournament
      * For every slot the two best-rated teams (by 'mean_rtg') that could play
        in it are found. Every team is assumed to meet the best one, and the
        best one the second best.
      * Rounds are processed in order. Within a round all teams read from a
        snapshot taken at the start of that round, so updates cannot bleed
        between teams of the same round. After a round a team carries
        ``max(own rating, assumed opponent's rating)`` forward.

    Parameters
    ----------
    regular_raw : pandas.DataFrame
        Needs 'Season', 'Daynum', 'Team1', 'Team2', 'score_1', 'score_2',
        'dist_1', 'dist_2'.
    tourney_raw : pandas.DataFrame
        Needs 'SlotMatchup', 'Team1', 'Team2', 'mean_rtg_1', 'mean_rtg_2'
        (higher 'mean_rtg' = better team). Every tournament team must also
        appear in ``regular_raw``.
    n_seed, tau_seed, tau_feature : int
        Number of seeding passes and the decay constants described above.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``regular_raw`` sorted by season/day and ``tourney_raw``, each with
        'elom_1' and 'elom_2' added.
    """
    # ---------------------------------------------------------------- regular season
    rrc = regular_raw.sort_values(['Season', 'Daynum'])
    elo_map = { team : 0 for team in set(rrc['Team1']).union(set(rrc['Team2'])) }
    # "home" = the team that travelled less
    team1_closer = rrc['dist_1'] < rrc['dist_2']
    team2_closer = rrc['dist_1'] > rrc['dist_2']
    hscores = rrc[team1_closer]['score_1'].tolist() + rrc[team2_closer]['score_2'].tolist()
    ascores = rrc[team2_closer]['score_1'].tolist() + rrc[team1_closer]['score_2'].tolist()
    home_edge = np.mean(hscores) - np.mean(ascores)
    # keyed by sign(dist_1 - dist_2): -1 means Team1 was closer to home
    correction = { -1 : -home_edge, 0 : 0, 1 : home_edge }
    rrc['eff_score_diff'] = (rrc['score_1'] - rrc['score_2']) +\
        (rrc['dist_1'] - rrc['dist_2']).map(lambda x: correction[int(np.sign(x))])
    # decay time: how much history to consider?
    # this can in theory be reduced during the tournament to emphasize recent performance
    taus = [ tau_seed ] * n_seed + [ tau_feature ]
    is_for_feature = [ False ] * n_seed + [ True ]
    # seed the season -- run it a few times -- should over time bake in strength of schedule and converge
    elos_1, elos_2 = [], []
    games = rrc[['Team1', 'Team2', 'eff_score_diff']].values
    for tau, is_feature in zip(taus, is_for_feature):
        for team1, team2, effdiff in games:
            elo_1, elo_2 = elo_map[team1], elo_map[team2]
            # store beginning of game status if it's time (i.e. done with elo seed)
            if is_feature:
                elos_1 += [ elo_1 ]
                elos_2 += [ elo_2 ]
            # update elos
            source_elo_change = np.sign(effdiff) * np.log(1 + abs(effdiff))
            elo_map[team1] = (elo_2 + source_elo_change) / tau + (1 - 1/tau) * elo_1
            elo_map[team2] = (elo_1 - source_elo_change) / tau + (1 - 1/tau) * elo_2
    rrc['elom_1'] = elos_1
    rrc['elom_2'] = elos_2

    # ---------------------------------------------------------------- tournament
    # every (slot, team) combination, best-rated team first within each slot
    teams_by_round = pd.concat([
        tourney_raw[['SlotMatchup', 'Team1', 'mean_rtg_1']].rename(columns = {'Team1':'Team_Id', 'mean_rtg_1':'mean_rtg'}),
        tourney_raw[['SlotMatchup', 'Team2', 'mean_rtg_2']].rename(columns = {'Team2':'Team_Id', 'mean_rtg_2':'mean_rtg'}),
    ])\
        .sort_values(['SlotMatchup', 'mean_rtg'], ascending = [True, False])\
        .drop_duplicates(['SlotMatchup', 'Team_Id'])\
        .reset_index(drop = True)
    # primary = best-rated team of the slot, secondary = runner-up
    top_seeds_per_round = teams_by_round.groupby('SlotMatchup').head(2).reset_index(drop = True)
    top_seeds_per_round['Is_Primary'] = top_seeds_per_round.groupby('SlotMatchup').cumcount() == 0
    top_seeds_per_round = \
        top_seeds_per_round.pivot(index = 'SlotMatchup', columns = 'Is_Primary', values = 'Team_Id').reset_index()
    top_seeds_per_round = top_seeds_per_round.rename(columns = {False : 'Secondary', True : 'Primary'})
    top_seeds_per_round.columns.name = None
    # four-character slots carry the round number as their second character; anything else is round 0
    top_seeds_per_round['Round'] = top_seeds_per_round['SlotMatchup']\
        .map(lambda x: 0 if len(x) != 4 else int(x[1]))
    momentum_elo_estimates = teams_by_round.join(top_seeds_per_round.set_index('SlotMatchup'), on = 'SlotMatchup')
    momentum_elo_estimates = momentum_elo_estimates.sort_values(['Round', 'SlotMatchup', 'Team_Id']).reset_index(drop = True)
    # sim the elo's; each round reads from a snapshot so updates do not bleed within the round
    elos = []
    last_round = None
    elo_map_updated = elo_map.copy()
    for team, primary, secondary, this_round in momentum_elo_estimates[['Team_Id', 'Primary', 'Secondary', 'Round']].values:
        if this_round != last_round:
            # new round: freeze everything learned in the previous one
            elo_map = elo_map_updated.copy()
            last_round = this_round
        opponent = primary if team != primary else secondary
        elo, opp_elo = elo_map[team], elo_map[opponent]
        elos += [elo]
        # Surviving the round is modelled as being at least as good as the assumed
        # opponent (the alternative tau-weighted update was left disabled upstream).
        elo_map_updated[team] = max(opp_elo, elo)
    momentum_elo_estimates['elom'] = elos
    # attach to tourney_raw
    elom_1 = momentum_elo_estimates[['SlotMatchup', 'Team_Id', 'elom']]\
        .rename(columns = {'Team_Id' : 'Team1', 'elom' : 'elom_1'})
    elom_2 = momentum_elo_estimates[['SlotMatchup', 'Team_Id', 'elom']]\
        .rename(columns = {'Team_Id' : 'Team2', 'elom' : 'elom_2'})
    tourney_raw_with_elom = tourney_raw\
        .join(elom_1.set_index(['SlotMatchup', 'Team1']), on = ['SlotMatchup', 'Team1'])\
        .join(elom_2.set_index(['SlotMatchup', 'Team2']), on = ['SlotMatchup', 'Team2'])
    # OUTPUT
    return rrc.drop('eff_score_diff', axis = 1), tourney_raw_with_elom
In [11]:
def get_raw_reg_season_data(reg_season, team_geog, season):
    """Regular-season games of one season in team-order (not winner/loser) form.

    'Team1' is the lower team id and 'Team2' the higher one. Scores and travel
    distances are re-labelled accordingly. 'posessions' is the KenPom estimate
    FGA - OR + TO + 0.475 x FTA averaged over both teams.
    """
    rsr = reg_distance_to_game(reg_season[reg_season['Season'] == season], team_geog)  # reg season raw
    winner_lower = rsr['Wteam'] < rsr['Lteam']
    winner_higher = rsr['Wteam'] > rsr['Lteam']
    # (output column, winner-side column, loser-side column)
    relabel = [('Team', 'Wteam', 'Lteam'), ('score_', 'Wscore', 'Lscore'), ('dist_', 'w_dist', 'l_dist')]
    for stem, w_col, l_col in relabel:
        rsr[stem + '1'] = np.where(winner_lower, rsr[w_col], rsr[l_col])
        rsr[stem + '2'] = np.where(winner_higher, rsr[w_col], rsr[l_col])
    # From kenpom: FGA-OR+TO+0.475xFTA
    winner_poss = rsr['Wfga'] - rsr['Wor'] + rsr['Wto'] + 0.475*rsr['Wfta']
    loser_poss = rsr['Lfga'] - rsr['Lor'] + rsr['Lto'] + 0.475*rsr['Lfta']
    rsr['posessions'] = 0.5 * (winner_poss + loser_poss)
    return rsr[['Season', 'Daynum', 'Team1', 'Team2', 'score_1', 'score_2', 'posessions', 'dist_1', 'dist_2']]
def get_raw_tourney_data(tourney_seeds, tourney_trees, tourney_geog, team_geog, season):
    """Every possible tournament pairing of a season with its slot and travel distances.

    All seeded teams are paired up (lower id as 'Team1'). For each pair the slot
    where they would meet is derived from the bracket tree, and the distances to
    that slot's venue are attached.
    """
    # tree to find play location
    tree = tourney_trees[season]
    # get all teams in tourney
    season_seeds = tourney_seeds[tourney_seeds['Season'] == season]
    seed_map = season_seeds.set_index('Team')['Seed'].to_dict()
    teams = sorted(seed_map.keys())
    team_pairs = [ (teams[i], teams[j]) for i in range(len(teams)) for j in range(i + 1, len(teams)) ]
    tourney_raw = pd.DataFrame(team_pairs).rename(columns = { 0 : 'Team1', 1 : 'Team2' })
    tourney_raw['Season'] = season
    # find out where they would play each other
    tourney_raw['SlotMatchup'] = [
        slot_matchup_from_seed(tree, seed_map[team1], seed_map[team2])
        for team1, team2 in zip(tourney_raw['Team1'], tourney_raw['Team2'])
    ]
    # get features
    return tourney_distance_to_game(tourney_raw, tourney_geog, team_geog, season)
def attach_supplements(data, reg_season, kenpom, ratings_eos, ratings_by_day, season, use_eos_rtg = False):
    """Attach rating, KenPom and score-fluctuation columns to ``data``.

    Ratings come from ``ratings_eos`` when ``use_eos_rtg`` is true, otherwise
    from ``ratings_by_day``. ``data`` itself is not modified.
    """
    if use_eos_rtg:  # get rating diff stats
        with_ratings = attach_ratings_diff_stats(data.copy(), ratings_eos, season)
    else:
        with_ratings = attach_ratings_by_day_diff_stats(data.copy(), ratings_by_day, season)
    with_kenpom = attach_kenpom_stats(with_ratings, kenpom, season)
    return attach_score_fluctuations(with_kenpom, reg_season, season)
In [12]:
def generate_features(df):
    """Derive the model features (Team1 minus Team2 differences) from raw columns.

    Returns 'Team1', 'Team2', 'Season' and the features 'ln_dist_diff',
    'rtg_diff', 't_rtg', 'pt_diff', 't_score', 'elom_diff'. When ``df`` has both
    'score_1' and 'score_2', the truth column 'Team1_win' is appended.
    """
    out = df[['Team1', 'Team2', 'Season']].copy()
    rating_gap = df['mean_rtg_1'] - df['mean_rtg_2']
    margin_gap = df['adjem_1'] - df['adjem_2']
    # use negative so that shorter travel distance has higher statistic
    out['ln_dist_diff'] = -np.log((1 + df['dist_1'])/(1 + df['dist_2']))
    out['rtg_diff'] = rating_gap
    out['t_rtg'] = rating_gap / np.sqrt(df['std_rtg_1']**2 + df['std_rtg_2']**2)
    out['pt_diff'] = margin_gap
    out['t_score'] = margin_gap / np.sqrt(df['std_mgn_1']**2 + df['std_mgn_2']**2)
    # elom is supposed to be a humbler -- do nothing with big diff, do a lot (humble) with a small diff?
    out['elom_diff'] = df['elom_1'] - df['elom_2']
    # truth feature: did team 1 win?
    if 'score_1' in df.columns and 'score_2' in df.columns:
        out['Team1_win'] = df['score_1'] > df['score_2']
    return out
def normalize_features(train, test, features):
    """Scale the ``features`` columns of train and test together (no centring).

    Both frames are stacked, passed through ``skpp.scale(..., with_mean = False)``
    and split again. Copies of ``train`` and ``test`` are returned with the
    scaled columns written back.
    """
    n_train = len(train)
    stacked = pd.concat([train[features], test[features]])
    scaled = skpp.scale(stacked, with_mean = False)  # with_mean = False ?
    train_norm, test_norm = train.copy(), test.copy()
    train_norm[features] = scaled[:n_train]
    test_norm[features] = scaled[n_train:]
    return train_norm, test_norm
In [13]:
def get_key(df):
    """Build the '<Season>_<Team1>_<Team2>' identifier for every row of ``df``."""
    season, team1, team2 = ( df[col].map(str) for col in ('Season', 'Team1', 'Team2') )
    return season + '_' + team1 + '_' + team2
In [14]:
# Columns fed to the model; the commented-out lists are alternative subsets.
features_to_use = [
    'ln_dist_diff',
    'rtg_diff',
    't_rtg',
    'pt_diff',
    't_score',
    'elom_diff',
]
#features_to_use = ['ln_dist_diff', 'rtg_diff', 't_rtg', 'pt_diff', 't_score']
#features_to_use = ['ln_dist_diff', 'rtg_diff', 't_rtg', 't_score', 'elom_diff']
#features_to_use = ['ln_dist_diff', 't_rtg', 't_score', 'elom_diff']
#features_to_use = ['ln_dist_diff', 'pt_diff', 't_score', 'elom_diff']
#features_to_use = ['ln_dist_diff', 'rtg_diff', 't_rtg', 'elom_diff']
#features_to_use = ['elom_diff']
# Name of the truth column produced by generate_features().
predict_field = 'Team1_win'
In [15]:
# Rating statistics for every season/day/team. Computed once at notebook scope
# because get_features() reads this module-level name instead of receiving it
# as an argument.
ratings_by_day = get_ratings_by_day(ratings)
In [ ]:
def get_features(season, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog):
    """Build raw data, features and normalised features for one season.

    The regular season is the training set (ratings as of game day). All
    possible tournament pairings are the test set (end-of-season ratings).

    Reads the module-level names ``ratings_by_day`` and ``features_to_use``;
    both must be defined before this is called.

    Returns ``(regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm)``.
    """
    # support data
    trees_by_season = get_tourney_trees(tourney_slots)
    eos_ratings = get_eos_ratings(ratings)
    # regular season cleaned data
    regular_raw = attach_supplements(
        get_raw_reg_season_data(reg_season, team_geog, season),
        reg_season, kenpom, eos_ratings, ratings_by_day, season, use_eos_rtg = False
    )
    # post season cleaned data
    tourney_raw = attach_supplements(
        get_raw_tourney_data(tourney_seeds, trees_by_season, tourney_geog, team_geog, season),
        reg_season, kenpom, eos_ratings, ratings_by_day, season, use_eos_rtg = True
    )
    # attach elo momentum
    # -- this guy is different from everything else because it uses reg season to inform tourney
    # -- therefore it should be treated appropriately and differently
    regular_raw, tourney_raw = attach_elom(regular_raw, tourney_raw)
    # get and normalize features
    feat_train, feat_test = generate_features(regular_raw), generate_features(tourney_raw)
    train_norm, test_norm = normalize_features(feat_train, feat_test, features_to_use)
    # playing with artificially making these coefficients higher (since lr overfits to score -- it == win)
    return regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm
def make_predictions(season, train_norm, test_norm, tourney, C1 = 1, C2 = 1, slot_matchups = None):
    """Fit two logistic regressions and predict every possible tournament pairing.

    Both models are fitted on the same training data and differ only in their
    regularisation strength: ``C1`` is used for pairings whose round is <= 1,
    ``C2`` for later rounds.

    Parameters
    ----------
    season : int
        Season being predicted; results are scored if it occurs in ``tourney``.
    train_norm, test_norm : pandas.DataFrame
        Normalised training / test features (see ``normalize_features``).
    tourney : pandas.DataFrame
        Actual tournament results ('Season', 'Daynum', 'Wteam', 'Lteam').
    C1, C2 : float
        Inverse regularisation strengths of the two models.
    slot_matchups : sequence of str, optional
        Slot of every row of ``test_norm``, in the same order. When omitted, the
        notebook-level ``tourney_raw['SlotMatchup']`` is used, which requires
        ``tourney_raw`` to be the frame ``test_norm`` was derived from.

    Returns
    -------
    (predictions, res, [ll, ll1, ll2], [lr1, lr2])
        ``predictions`` has columns 'Id' and 'Pred' (probability that Team1
        wins). ``res`` joins actual results to predictions. The log losses are
        overall / first group / later rounds, all None when ``season`` has no
        results.

    Raises
    ------
    ValueError
        If the number of slots differs from the number of rows in ``test_norm``.
    """
    if slot_matchups is None:
        # backward-compatible fallback on notebook state
        slot_matchups = tourney_raw['SlotMatchup']
    tourney_round = np.array([ 0 if len(slot) < 3 else int(slot[1]) for slot in slot_matchups ])
    if len(tourney_round) != len(test_norm):
        raise ValueError('slot_matchups has %d entries but test_norm has %d rows'
                         % (len(tourney_round), len(test_norm)))
    # fit
    lr1 = sklm.LogisticRegression(C = C1, fit_intercept = False) # fit_intercept = False?
    lr2 = sklm.LogisticRegression(C = C2, fit_intercept = False)
    lr1.fit(train_norm[features_to_use].values, train_norm[predict_field].values)
    lr2.fit(train_norm[features_to_use].values, train_norm[predict_field].values)
    # predictions
    early = tourney_round <= 1
    late = tourney_round > 1
    probs1 = lr1.predict_proba(test_norm[early][features_to_use].values)
    probs2 = lr2.predict_proba(test_norm[late][features_to_use].values)
    keys1 = get_key(test_norm[early])
    keys2 = get_key(test_norm[late])
    predictions_with_round_id = pd.concat([
        pd.DataFrame({'Id' : keys1.values, 'Pred' : probs1[:,1], 'first_round' : True}),
        pd.DataFrame({'Id' : keys2.values, 'Pred' : probs2[:,1], 'first_round' : False})
    ])
    predictions = predictions_with_round_id[['Id', 'Pred']]
    if season in tourney['Season'].values:
        # Evaluate outcomes (only games after day 135 are scored)
        res_base = tourney[(tourney['Season'] == season) & (tourney['Daynum'] > 135)].copy().reset_index()
        res_base['Team1'] = np.where(res_base['Wteam'] < res_base['Lteam'], res_base['Wteam'], res_base['Lteam'])
        res_base['Team2'] = np.where(res_base['Wteam'] > res_base['Lteam'], res_base['Wteam'], res_base['Lteam'])
        res_base['Result'] = (res_base['Wteam'] == res_base['Team1']).map(lambda x: 1 if x else 0)
        res_base['Id'] = get_key(res_base)
        # attach results to predictions
        res = pd.merge(res_base[['Id', 'Result']], predictions_with_round_id, on = 'Id', how = 'left')
        res1 = res[res['first_round'] == True]
        res2 = res[res['first_round'] == False]
        # logloss -- labels are given explicitly so a subset in which only one
        # outcome occurs is still scored instead of raising
        ll = skm.log_loss(res['Result'], res['Pred'], labels = [0, 1])
        ll1 = skm.log_loss(res1['Result'], res1['Pred'], labels = [0, 1])
        ll2 = skm.log_loss(res2['Result'], res2['Pred'], labels = [0, 1])
    else:
        res = ll = ll1 = ll2 = None
    return predictions, res, [ll, ll1, ll2], [lr1, lr2]
In [ ]:
# Predict each season separately and collect the per-season prediction frames.
all_predictions = []
for season in [2014, 2015, 2016]:
    (regular_raw, tourney_raw,
     feat_train, feat_test,
     train_norm, test_norm) = get_features(season, tourney_slots, ratings, reg_season,
                                           team_geog, kenpom, tourney_seeds, tourney_geog)
    # see below for choice of C
    predictions, res, lls, lrs = make_predictions(season, train_norm, test_norm, tourney,
                                                  C1 = 7e-3, C2 = 5e0)
    print(lls, lrs[0].coef_)
    all_predictions.append(predictions)
In [ ]:
# 0.587211080951
# 0.564406324468
# 0.493727883134
# 0.531440236192
In [18]:
# Write the predictions of all seasons into one submission file ('Id', 'Pred').
# NOTE(review): the './submissions' directory is assumed to exist already.
pd.concat(all_predictions).to_csv('./submissions/logisticModel_regSeason_7e-3_5e0_dist_rtg_score_20170315.csv', index = False)
In [173]:
# Raw regular-season rows whose normalised t_rtg is below -7. The mask comes
# from train_norm, so both frames must stem from the same get_features() call
# (i.e. share an index).
regular_raw[train_norm['t_rtg'] < -7]
Out[173]:
In [20]:
# Pairwise scatter plots of the normalised training features, coloured by outcome.
sns.pairplot(train_norm, hue = predict_field, vars = features_to_use)
plt.show()
In [239]:
# Evaluated games ordered by absolute prediction error, largest miss first.
# argsort yields positions, so positional .iloc is the matching indexer.
res.iloc[np.argsort(-(res['Pred'] - res['Result']).abs().values)].reset_index(drop = True)
Out[239]:
In [136]:
# accuracy? share of games where the favoured side (Pred > 0.5 means Team1) actually won
np.where(res['Pred'] > 0.5, res['Result'] == 1, res['Result'] == 0).mean()
Out[136]:
In [19]:
# Sweep the regularisation strength C (same value for both models) over a log
# grid and record the log loss per season: overall, first group and later rounds.
cs_to_check = np.power(10, np.arange(-4, 2 + 1e-9, 0.1))
years_to_check = range(2011, 2017)
c_effect_df_dict = { 'C' : cs_to_check }
c_effect_df_dict_first_round = dict(c_effect_df_dict)
c_effect_df_dict_later_round = dict(c_effect_df_dict)
for yr in years_to_check:
    (regular_raw, tourney_raw,
     feat_train, feat_test,
     train_norm, test_norm) = get_features(yr, tourney_slots, ratings, reg_season,
                                           team_geog, kenpom, tourney_seeds, tourney_geog)
    log_losses_components = [
        make_predictions(yr, train_norm, test_norm, tourney, C1 = C, C2 = C)[2]
        for C in cs_to_check
    ]
    # transpose [[ll, ll1, ll2], ...] into three per-C lists
    log_losses, log_losses_first_round, log_losses_later_round = (
        list(column) for column in zip(*log_losses_components)
    )
    c_effect_df_dict[str(yr)] = log_losses
    c_effect_df_dict_first_round[str(yr)] = log_losses_first_round
    c_effect_df_dict_later_round[str(yr)] = log_losses_later_round
c_effect = pd.DataFrame(c_effect_df_dict)
c_effect_first_round = pd.DataFrame(c_effect_df_dict_first_round)
c_effect_later_round = pd.DataFrame(c_effect_df_dict_later_round)
c_effect_all = c_effect.join(c_effect_first_round.set_index('C'), on = 'C', rsuffix = '_first_rnd')
c_effect_all = c_effect_all.join(c_effect_later_round.set_index('C'), on = 'C', rsuffix = '_later_rnd')
In [23]:
# First-group log loss against C, one line per season.
plt.semilogx()
for col in [ col for col in c_effect_all if col != 'C' and 'first' in col ]:
    # the label is what plt.legend() displays; without it the legend is empty
    plt.plot(c_effect_all['C'], c_effect_all[col], label = col)
plt.legend(loc = 3)
plt.xlabel('C')
plt.ylabel('logloss')
plt.ylim(0.4, 0.75)
plt.show()
In [60]:
# Column-wise minimum: the best overall log loss per season across the C grid
# (the 'C' column's own minimum is listed as well).
c_effect.min()
Out[60]:
In [ ]:
In [120]:
# contribution to logloss: each game's share of the total, plotted against the features
rc = res.copy()
ftc = feat_test.copy()
ftc['Id'] = get_key(ftc)
rc['logloss_contrib'] = -np.log(np.where(rc['Result'] == 1, rc['Pred'], 1 - rc['Pred'])) / len(rc)
ftc = pd.merge(rc, ftc, how = 'left', on = 'Id')
fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (10, 4))
# a colormap name works across matplotlib versions; plt.cm.get_cmap does not
im = axes[0].scatter(ftc['t_score'], ftc['t_rtg'], c = ftc['logloss_contrib'], vmin = 0, vmax = 0.025, cmap = 'coolwarm')
axes[0].set_xlabel('t_score')
axes[0].set_ylabel('t_rtg')
axes[1].scatter(-ftc['ln_dist_diff'], ftc['t_rtg'], c = ftc['logloss_contrib'], vmin = 0, vmax = 0.025, cmap = 'coolwarm')
# the plotted quantity is the negated feature, so the label says so
axes[1].set_xlabel('-ln_dist_diff')
cb = fig.colorbar(im, ax=axes.ravel().tolist(), label = 'logloss_contrib')
plt.show()
In [121]:
# Log-loss contribution by round: mean per game (left) and worst single game (right).
tourney_rounds = pd.DataFrame({
    'Id' : get_key(tourney_raw),
    'round' : tourney_raw['SlotMatchup'].map(lambda s: int(s[1])),
})[['Id', 'round']]
ftc_with_rounds = ftc.merge(tourney_rounds, how = 'left', on = 'Id')
fig, axs = plt.subplots(ncols=2, figsize = (10, 4))
sns.barplot(x = 'round', y = 'logloss_contrib', data = ftc_with_rounds, errwidth = 0, ax = axs[0])
sns.barplot(x = 'round', y = 'logloss_contrib', data = ftc_with_rounds, errwidth = 0, estimator=max, ax = axs[1])
axs[0].set_ylim(0, 0.015)
axs[1].set_ylim(0, 0.06)
plt.show()
In [321]:
# Spread of the rating systems against the mean rating, one point per (season, team).
# ratings_eos only exists inside get_features(), so it is built here explicitly.
ratings_eos = get_eos_ratings(ratings)
sns.lmplot(x = 'mean_rtg', y = 'std_rtg', data = ratings_eos, fit_reg = False)
plt.show()
In [353]:
# Does a parabola in the mean rating (zero at 0 and at the maximum) explain the spread?
# get_eos_ratings() returns a fresh frame, so this cell needs no other cell's state.
ratings_eos_test = get_eos_ratings(ratings)
half_max_rtg = ratings_eos_test['mean_rtg'].max() / 2
ratings_eos_test['parabola_mean_model'] = half_max_rtg**2 - (ratings_eos_test['mean_rtg'] - half_max_rtg)**2
sns.lmplot(x = 'parabola_mean_model', y = 'std_rtg', data = ratings_eos_test, fit_reg = False)
plt.show()
In [352]:
# Compare the raw rating difference with its t-like version (difference divided
# by the combined standard deviation of both teams' ratings).
# NOTE(review): `test_data` is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel. It needs a frame with
# 'mean_rtg_1/2' and 'std_rtg_1/2' columns -- presumably `tourney_raw`; confirm.
test_data_test = test_data.copy()
test_data_test['rtg_diff'] = test_data_test['mean_rtg_1'] - test_data_test['mean_rtg_2']
test_data_test['t_model'] = test_data_test['rtg_diff']/(test_data_test['std_rtg_1']**2 + test_data_test['std_rtg_2']**2)**0.5
#sns.lmplot('rtg_diff', 't_model', data = test_data_test, fit_reg = False)
sns.pairplot(test_data_test[['rtg_diff', 't_model']])
plt.show()
In [128]:
# One row per (game, team) of the 2016 regular season: distance travelled and points scored.
# reg_distance_to_game() supplies the 'w_dist' / 'l_dist' columns used below; the
# previously referenced get_training_data() is not defined in this notebook.
dist_test = reg_distance_to_game(reg_season[reg_season['Season'] == 2016], team_geog)
w_dist_test = dist_test[['w_dist', 'Wscore']].rename(columns = {'w_dist' : 'dist', 'Wscore' : 'score'})
l_dist_test = dist_test[['l_dist', 'Lscore']].rename(columns = {'l_dist' : 'dist', 'Lscore' : 'score'})
dist_test = pd.concat([w_dist_test, l_dist_test]).reset_index()[['dist', 'score']]
In [ ]:
# Histogram of distances travelled, with a log-scaled count axis and the x axis cut at 3000.
plt.hist(dist_test['dist'])
plt.xlim(0, 3000)
plt.semilogy()
plt.show()
In [ ]:
# Bucket games by log(1 + distance) and summarise the points scored per bucket.
bucket_size = 1
dist_test['bucket'] = bucket_size * (np.log(dist_test['dist'] + 1) // bucket_size)
# Aggregate only 'score' with string aggregator names; 'size' is renamed to
# 'len' to keep the column name used by the cells below.
dist_grp = dist_test.groupby('bucket')['score'].agg(['mean', 'std', 'size']).rename(columns = {'size' : 'len'})
dist_grp['err'] = dist_grp['std'] / np.sqrt(dist_grp['len'])  # standard error of the mean
In [ ]:
# Average score per log-distance bucket with a band of +/- 2 standard errors.
plt.plot(dist_grp['mean'])
plt.fill_between(dist_grp.index,
(dist_grp['mean'] - 2*dist_grp['err']).values,
(dist_grp['mean'] + 2*dist_grp['err']).values,
alpha = 0.3)
plt.xlabel('log of distance traveled')
plt.ylabel('avg score')
plt.show()