In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from geopy.distance import great_circle as geodist
import sklearn.linear_model as sklm
import sklearn.preprocessing as skpp
import sklearn.metrics as skm
import sklearn.model_selection as skms

In [2]:
# Load every input table used by the notebook.

# files that sit directly under data/
reg_season = pd.read_csv('data/RegularSeasonDetailedResults.csv')
tourney = pd.read_csv('data/TourneyDetailedResults.csv')
tourney_slots = pd.read_csv('data/TourneySlots.csv')
tourney_seeds = pd.read_csv('data/TourneySeeds.csv')
teams = pd.read_csv('data/Teams.csv')
kenpom = pd.read_csv('data/kenPomTeamData.csv')

# files that sit under data/addl/
ratings = pd.read_csv('data/addl/massey_ordinals_2003-2016.csv')
team_geog = pd.read_csv('data/addl/TeamGeog.csv')
tourney_geog = pd.read_csv('data/addl/TourneyGeog_Thru2016.csv')

Almond Nut Learner

Use published rankings together with distance traveled to play to classify winners + losers

Train on the regular season and test on the postseason

considerations:

  • Refine
    • Vegas odds in first round
    • PREDICTING UPSETS??
      • team upset rating
      • team score variance
      • upset predictors based on past seasons
    • Ratings closer to date played
    • Model tuning / hyperparameter tuning
  • Implemented
    • individual ratings vs aggregate
      • Look at aggregate and derive statistics
    • diff vs absolute ratings
      • Use diffs for feature generation
    • only use final rankings instead of those at time of play?
      • For now: time of play
    • Distance from home? Distance from last game?
      • For now: distance from home
    • How do regular season and playoffs differ in features?
      • Is using distance in playoffs trained on regular season right?
  • Augment (not yet executed)
    • Defensive / offense ratings from kenpom
    • Elo, Elo differences, and assoc probabilities
    • Ensemble?
      • Construct micro-classifier from elo
    • Coaches
    • Look at momentum + OT effects when training
    • Beginning of season vs end of season for training

In [3]:
def attach_ratings_diff_stats(df, ratings_eos, season):
    """Attach each team's rating summary (mean / std / count) to every matchup row.

    Parameters
    ----------
    df : DataFrame with at least 'Season', 'Team1' and 'Team2' columns.
    ratings_eos : DataFrame with 'season', 'team', 'mean_rtg', 'std_rtg', 'num_rtg'.
    season : accepted for signature parity with the other attach_* helpers;
        the join itself is keyed on the 'Season' column of ``df``.

    Returns
    -------
    DataFrame with the original columns of ``df`` followed by
    mean_rtg_1, std_rtg_1, num_rtg_1, mean_rtg_2, std_rtg_2, num_rtg_2.
    Both joins are inner joins, so rows whose teams have no rating are dropped.
    """
    stat_names = ['mean_rtg', 'std_rtg', 'num_rtg']
    wanted = list(df.columns)
    merged = df
    for side in ('1', '2'):
        suffixed = {name: name + '_' + side for name in stat_names}
        wanted += [suffixed[name] for name in stat_names]
        merged = merged.merge(
            ratings_eos.rename(columns = suffixed),
            left_on = ['Season', 'Team' + side],
            right_on = ['season', 'team'],
        )
    return merged[wanted]

def get_eos_ratings(ratings):
    """Summarise each team's ordinal rank on the last rating day of every season.

    Parameters
    ----------
    ratings : DataFrame with at least 'season', 'rating_day_num', 'team' and
        'orank' columns. Any other columns (numeric or not) are ignored.

    Returns
    -------
    DataFrame with columns 'season', 'team', 'mean_rtg', 'std_rtg', 'num_rtg':
    the mean, sample standard deviation and number of 'orank' rows per
    (season, team), using only rows dated on that season's maximum
    'rating_day_num'.
    """
    # Broadcast each season's last rating day onto its rows and keep only those rows.
    last_day = ratings.groupby('season')['rating_day_num'].transform('max')
    ratings_eos_all = ratings[ratings['rating_day_num'] == last_day]

    # Aggregate 'orank' only: aggregating the whole frame first would also try to
    # average non-numeric columns. String names pin pandas' own mean / sample std
    # instead of relying on how NumPy callables are translated.
    ratings_eos = ratings_eos_all.groupby(['season', 'team'])['orank'].agg(['mean', 'std', len])
    return ratings_eos.reset_index().rename(
        columns = {'mean' : 'mean_rtg', 'std' : 'std_rtg', 'len' : 'num_rtg'}
    )

In [20]:
def get_score_fluctuation(reg_season, season):
    """Per-team spread of victory margins for one regular season.

    Margins are corrected for the season's average home/away scoring gap and
    expressed per 100 possessions. This is a quick estimate: finer home/away
    handling would only change the standard deviation at second order, and the
    units do not matter because the result is used in a ratio and normalised later.

    Parameters
    ----------
    reg_season : DataFrame of detailed game results with 'Season', 'Wteam',
        'Lteam', 'Wscore', 'Lscore', 'Wloc' and the W*/L* 'fga', 'or', 'to',
        'fta' box-score columns.
    season : value matched against the 'Season' column.

    Returns
    -------
    DataFrame with columns 'team' and 'std_mgn' (sample standard deviation of
    the team's scaled margins; losses count as negative margins).
    """
    rsc = reg_season[reg_season['Season'] == season].copy()

    home_won = rsc['Wloc'] == 'H'
    away_won = rsc['Wloc'] == 'A'

    # average points scored at home minus average points scored away
    hscores = pd.concat([rsc.loc[home_won, 'Wscore'], rsc.loc[away_won, 'Lscore']])
    ascores = pd.concat([rsc.loc[away_won, 'Wscore'], rsc.loc[home_won, 'Lscore']])
    home_correction = hscores.mean() - ascores.mean()

    # possessions per game, averaged over the two teams
    possessions = 0.5 * (
        rsc['Lfga'] - rsc['Lor'] + rsc['Lto'] + 0.475 * rsc['Lfta']
        + rsc['Wfga'] - rsc['Wor'] + rsc['Wto'] + 0.475 * rsc['Wfta']
    )

    # victory margin with the home edge removed (neutral-site games are untouched)
    margin = rsc['Wscore'] - rsc['Lscore']
    margin = margin + np.where(home_won, -home_correction, 0) + np.where(away_won, home_correction, 0)
    scaled = margin * 100 / possessions  # margin per 100 possessions

    # one row per (team, game): winners see +margin, losers see -margin
    win_mgns = pd.concat([
        pd.DataFrame({'team' : rsc['Wteam'], 'mgn' : scaled}),
        pd.DataFrame({'team' : rsc['Lteam'], 'mgn' : -scaled}),
    ])

    # .std() pins the sample standard deviation explicitly rather than depending on
    # how pandas translates a NumPy callable passed to aggregate().
    return win_mgns.groupby('team')['mgn'].std().rename('std_mgn').reset_index()

def attach_score_fluctuations(df, reg_season, season):
    """Attach each team's margin spread for ``season`` as std_mgn_1 / std_mgn_2.

    ``df`` needs 'Team1' and 'Team2' columns. The result keeps the original
    columns of ``df`` followed by 'std_mgn_1' and 'std_mgn_2'. Both joins are
    inner joins, so rows for teams without a computed spread are dropped.
    """
    fluct = get_score_fluctuation(reg_season, season)

    merged = df
    added = []
    for side in ('1', '2'):
        column = 'std_mgn_' + side
        added.append(column)
        merged = merged.merge(
            fluct.rename(columns = {'std_mgn' : column}),
            left_on = 'Team' + side,
            right_on = 'team',
        )
    return merged[list(df.columns) + added]

In [5]:
def attach_kenpom_stats(df, kenpom, season):
    """Attach the 'AdjEM' and 'AdjTempo' kenpom columns for both teams of each matchup.

    ``kenpom`` is filtered to ``season`` and joined (inner) on 'Team_Id' against
    'Team1' and then 'Team2'. The result keeps the original columns of ``df``
    followed by adjem_1, adjem_2, adjt_1, adjt_2.
    """
    season_kp = kenpom.loc[kenpom['Season'] == season, ['Team_Id', 'AdjEM', 'AdjTempo']]
    kp_first = season_kp.rename(columns = {'AdjEM' : 'adjem_1', 'AdjTempo' : 'adjt_1'})
    kp_second = season_kp.rename(columns = {'AdjEM' : 'adjem_2', 'AdjTempo' : 'adjt_2'})

    with_first = df.merge(kp_first, left_on = 'Team1', right_on = 'Team_Id')
    with_both = with_first.merge(kp_second, left_on = 'Team2', right_on = 'Team_Id')

    return with_both[list(df.columns) + ['adjem_1', 'adjem_2', 'adjt_1', 'adjt_2']]

In [6]:
def get_root_and_leaves(hierarchy):
    """Find the top slot and the bottom entries of a slot hierarchy.

    ``hierarchy`` has 'Slot', 'Strongseed' and 'Weakseed' columns. The root is
    a 'Slot' value that never appears as a seed; the leaves are seed values
    that never appear as a 'Slot'. Returns ``(root, leaves)`` where ``leaves``
    is a list in set-iteration order. Raises IndexError when no root exists.
    """
    children = set(hierarchy[['Strongseed', 'Weakseed']].values.flatten())
    parents = set(hierarchy[['Slot']].values.flatten())

    top_slots = list(filter(lambda slot: slot not in children, parents))
    bottom_nodes = list(filter(lambda node: node not in parents, children))
    return top_slots[0], bottom_nodes

def get_tourney_tree_one_season(tourney_slots, season):
    """Build a child -> {'parent', 'depth'} map of the bracket for one season.

    Every 'Strongseed' / 'Weakseed' value is a key; its 'parent' is the slot it
    feeds into. 'depth' is the number of steps up to the root slot, filled in
    for every node on a path from a leaf; other nodes keep depth -1.
    The root itself is not a key of the returned dict.
    """
    season_rows = tourney_slots[tourney_slots['Season'] == season]
    hierarchy = season_rows[['Slot', 'Strongseed', 'Weakseed']]
    root, leaves = get_root_and_leaves(hierarchy) # should be R6CH...

    # strong seeds first, then weak seeds (a repeated key takes the weak-seed slot)
    parent_of = dict(zip(hierarchy['Strongseed'], hierarchy['Slot']))
    parent_of.update(zip(hierarchy['Weakseed'], hierarchy['Slot']))
    tree = { node : {'parent' : parent_of[node], 'depth' : -1} for node in parent_of }

    def depth_of(node):
        # memoised walk towards the root; the root sits at depth 0
        if node == root:
            return 0
        entry = tree[node]
        if entry['depth'] < 0:
            entry['depth'] = 1 + depth_of(entry['parent'])
        return entry['depth']

    for leaf in leaves:
        depth_of(leaf)

    return tree

def get_tourney_trees(tourney_slots):
    """Return {season: bracket tree} for every season present in ``tourney_slots``."""
    trees = {}
    for season in tourney_slots['Season'].unique():
        trees[season] = get_tourney_tree_one_season(tourney_slots, season)
    return trees

def slot_matchup_from_seed(tree, seed1, seed2):
    """Return the slot in which the two seeds would face each other.

    ``tree`` maps each node to {'parent', 'depth'}. Starting from the two
    seeds, whichever node is at least as deep as the other steps up to its
    parent (both step when depths tie) until the two paths meet; the meeting
    node is returned. Equal inputs are returned unchanged.
    """
    left, right = seed1, seed2
    while left != right:
        left_depth = tree[left]['depth']
        right_depth = tree[right]['depth']
        # the shallower node waits; the other climbs one level
        climbed_left = left if left_depth < right_depth else tree[left]['parent']
        climbed_right = right if right_depth < left_depth else tree[right]['parent']
        left, right = climbed_left, climbed_right
    return left

def get_team_seed(tourney_seeds, season, team):
    """Return the 'Seed' of ``team`` in ``season``.

    Returns None unless exactly one row of ``tourney_seeds`` matches both
    the 'Team' and the 'Season' value.
    """
    is_match = (tourney_seeds['Team'] == team) & (tourney_seeds['Season'] == season)
    matches = tourney_seeds[is_match]['Seed'].values
    return matches[0] if len(matches) == 1 else None

In [7]:
def dist(play_lat, play_lng, lat, lng):
    """Great-circle distance in miles between (play_lat, play_lng) and (lat, lng)."""
    venue = (play_lat, play_lng)
    origin = (lat, lng)
    return geodist(venue, origin).miles

def reg_distance_to_game(games_in, team_geog):
    """Add winner / loser travel distance ('w_dist', 'l_dist', miles) to each game.

    ``games_in`` needs 'Wteam', 'Lteam' and 'Wloc'; ``team_geog`` needs
    'team_id', 'lat' and 'lng'. The game is placed at the winner's coordinates
    when 'Wloc' is 'H' and at the loser's coordinates otherwise. For neutral
    games ('N') both teams are then given the average of the two distances.
    The input frame is not modified; teams missing from ``team_geog`` drop out
    through the inner joins.
    """
    original_cols = list(games_in.columns)

    games = games_in.copy()
    games = games.merge(
        team_geog.rename(columns = {'lat' : 'w_lat', 'lng' : 'w_lng'}),
        left_on = 'Wteam', right_on = 'team_id'
    )
    games = games.merge(
        team_geog.rename(columns = {'lat' : 'l_lat', 'lng' : 'l_lng'}),
        left_on = 'Lteam', right_on = 'team_id'
    )

    # provisional venue: winner's home if the winner hosted, else the loser's home
    winner_hosted = games['Wloc'] == 'H'
    games['play_lat'] = np.where(winner_hosted, games['w_lat'], games['l_lat'])
    games['play_lng'] = np.where(winner_hosted, games['w_lng'], games['l_lng'])

    def miles_from_home(prefix):
        return games.apply(
            lambda row: dist(row['play_lat'], row['play_lng'], row[prefix + '_lat'], row[prefix + '_lng']),
            axis = 1
        )

    w_raw = miles_from_home('w')
    l_raw = miles_from_home('l')

    # neutral site: split the distance evenly between the two teams
    neutral = games['Wloc'] == 'N'
    shared = (w_raw + l_raw) / 2
    games['w_dist'] = np.where(neutral, shared, w_raw)
    games['l_dist'] = np.where(neutral, shared, l_raw)

    return games[original_cols + ['w_dist', 'l_dist']]

def tourney_distance_to_game(tourney_raw_in, tourney_geog, team_geog, season):
    """Add each team's distance (miles) from home to the slot's venue.

    ``tourney_raw_in`` needs 'Team1', 'Team2' and 'SlotMatchup';
    ``tourney_geog`` needs 'season', 'slot', 'lat', 'lng'; ``team_geog`` needs
    'team_id', 'lat', 'lng'. Returns the input columns plus 'dist_1' and
    'dist_2'. All joins are inner joins; the input frame is not modified.
    """
    wanted = list(tourney_raw_in.columns) + ['dist_1', 'dist_2']

    venues = tourney_geog[tourney_geog['season'] == season][['slot', 'lat', 'lng']]
    venues = venues.rename(columns = {'lat' : 'lat_p', 'lng' : 'lng_p'})

    located = tourney_raw_in.copy()
    for side in ('1', '2'):
        homes = team_geog.rename(columns = {'lat' : 'lat_' + side, 'lng' : 'lng_' + side})
        located = located.merge(homes, left_on = 'Team' + side, right_on = 'team_id')
    located = located.merge(venues, left_on = 'SlotMatchup', right_on = 'slot')

    for side in ('1', '2'):
        located['dist_' + side] = located.apply(
            lambda row, s = side: dist(row['lat_p'], row['lng_p'], row['lat_' + s], row['lng_' + s]),
            axis = 1
        )

    return located[wanted]

In [8]:
def get_raw_reg_season_data(reg_season, team_geog, season):
    """Regular-season games of ``season`` re-keyed as (Team1, Team2) with Team1 < Team2.

    Scores and travel distances are re-assigned so that the *_1 columns belong
    to the lower team id and the *_2 columns to the higher one. Returns the
    columns Season, Daynum, Team1, Team2, score_1, score_2, dist_1, dist_2.
    """
    season_games = reg_distance_to_game(reg_season[reg_season['Season'] == season], team_geog)

    winner_is_lower = season_games['Wteam'] < season_games['Lteam']
    winner_is_higher = season_games['Wteam'] > season_games['Lteam']

    oriented = season_games.assign(
        Team1 = np.where(winner_is_lower, season_games['Wteam'], season_games['Lteam']),
        Team2 = np.where(winner_is_higher, season_games['Wteam'], season_games['Lteam']),
        score_1 = np.where(winner_is_lower, season_games['Wscore'], season_games['Lscore']),
        score_2 = np.where(winner_is_higher, season_games['Wscore'], season_games['Lscore']),
        dist_1 = np.where(winner_is_lower, season_games['w_dist'], season_games['l_dist']),
        dist_2 = np.where(winner_is_higher, season_games['w_dist'], season_games['l_dist']),
    )

    return oriented[['Season', 'Daynum', 'Team1', 'Team2', 'score_1', 'score_2', 'dist_1', 'dist_2']]

def get_raw_tourney_data(tourney_seeds, tourney_trees, tourney_geog, team_geog, season):
    """Enumerate every possible tournament pairing for ``season`` with travel distances.

    One row is produced per unordered pair of seeded teams (Team1 < Team2),
    tagged with 'Season' and with 'SlotMatchup', the slot where the pair would
    meet according to the season's bracket tree. Distances to that slot's
    venue are then attached as 'dist_1' / 'dist_2'.
    """
    # bracket tree used to locate where two seeds would meet
    tree = tourney_trees[season]

    # team id -> seed for every team in this season's tournament
    season_seeds = tourney_seeds[tourney_seeds['Season'] == season]
    seed_map = season_seeds.set_index('Team').to_dict()['Seed']
    teams = sorted(seed_map.keys())

    # all (lower id, higher id) pairs, already in sorted order
    team_pairs = [ (low, high) for pos, low in enumerate(teams) for high in teams[pos + 1:] ]
    tourney_raw = pd.DataFrame(team_pairs).rename(columns = { 0 : 'Team1', 1 : 'Team2' })
    tourney_raw['Season'] = season

    def matchup_slot(row):
        return slot_matchup_from_seed(tree, seed_map[row['Team1']], seed_map[row['Team2']])

    tourney_raw['SlotMatchup'] = tourney_raw.apply(matchup_slot, axis = 1)

    return tourney_distance_to_game(tourney_raw, tourney_geog, team_geog, season)

def attach_supplements(data, reg_season, kenpom, ratings_eos, season):
    """Attach rating stats, kenpom stats and score fluctuations, in that order.

    Works on a copy of ``data`` and returns the enriched frame.
    """
    with_ratings = attach_ratings_diff_stats(data.copy(), ratings_eos, season)
    with_kenpom = attach_kenpom_stats(with_ratings, kenpom, season)
    return attach_score_fluctuations(with_kenpom, reg_season, season)

Feature engineering

  • Log of distance
  • Capture rating diffs
  • Capture rating diffs accounting for variance (t-score)
  • Diff in expected scores via EM diffs

Tag winners in training set + viz. Also, normalize data.


In [9]:
def generate_features(df):
    """Turn a supplemented matchup frame into model features.

    Features produced:
      ln_dist_diff : log((1 + dist_1) / (1 + dist_2))
      rtg_diff     : mean_rtg_2 - mean_rtg_1, written as a negated difference
      t_rtg        : rtg_diff divided by sqrt(std_rtg_1^2 + std_rtg_2^2)
      pt_diff      : adjem_1 - adjem_2
      t_score      : pt_diff divided by sqrt(std_mgn_1^2 + std_mgn_2^2)

    When both 'score_1' and 'score_2' are present, a boolean 'Team1_win'
    label (score_1 > score_2) is appended. Returns Team1, Team2, Season,
    the five features, and the label if available.
    """
    has_score = 'score_1' in df.columns and 'score_2' in df.columns

    # negated so that the better team gets the higher statistic
    rating_gap = -(df['mean_rtg_1'] - df['mean_rtg_2'])
    rating_noise = np.sqrt(df['std_rtg_1']**2 + df['std_rtg_2']**2)
    efficiency_gap = df['adjem_1'] - df['adjem_2']
    margin_noise = np.sqrt(df['std_mgn_1']**2 + df['std_mgn_2']**2)

    features = df.copy()
    features['ln_dist_diff'] = np.log((1 + df['dist_1'])/(1 + df['dist_2']))
    features['rtg_diff'] = rating_gap
    features['t_rtg'] = rating_gap / rating_noise
    features['pt_diff'] = efficiency_gap
    features['t_score'] = efficiency_gap / margin_noise

    selected = ['Team1', 'Team2', 'Season', 'ln_dist_diff', 'rtg_diff', 't_rtg', 'pt_diff', 't_score']

    # truth label: did team 1 win?
    if has_score:
        features['Team1_win'] = features['score_1'] > features['score_2']
        selected.append('Team1_win')

    return features[selected]

def normalize_features(train, test, features):
    """Scale the ``features`` columns of train and test together with ``skpp.scale``.

    The two frames are stacked before scaling, so both share one set of
    scaling statistics. Returns copies ``(train_norm, test_norm)`` with the
    scaled values written back; the inputs are left untouched.
    """
    n_train = len(train)
    stacked = pd.concat([train[features], test[features]])
    scaled = skpp.scale(stacked) # with_mean = False ?

    train_norm, test_norm = train.copy(), test.copy()
    train_norm[features] = scaled[:n_train]
    test_norm[features] = scaled[n_train:]
    return train_norm, test_norm

In [10]:
def get_key(df):
    """Build the '<Season>_<Team1>_<Team2>' identifier for every row of ``df``."""
    season, first, second = (df[column].map(str) for column in ('Season', 'Team1', 'Team2'))
    return season + '_' + first + '_' + second

Running the model


In [21]:
# model inputs (columns produced by generate_features) and the label column
features_to_use = [
    'ln_dist_diff',
    'rtg_diff',
    't_rtg',
    'pt_diff',
    't_score',
]
predict_field = 'Team1_win'

def get_features(season, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog):
    """Build raw, feature and normalised frames for one season.

    The regular season of ``season`` is the training set; every possible
    tournament pairing of that season is the test set.

    Returns
    -------
    (regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm)
    """
    # shared support data
    tourney_trees = get_tourney_trees(tourney_slots)
    ratings_eos = get_eos_ratings(ratings)

    # training side: regular-season games plus supplements
    regular_raw = attach_supplements(
        get_raw_reg_season_data(reg_season, team_geog, season),
        reg_season, kenpom, ratings_eos, season
    )

    # test side: all candidate tournament pairings plus supplements
    tourney_raw = attach_supplements(
        get_raw_tourney_data(tourney_seeds, tourney_trees, tourney_geog, team_geog, season),
        reg_season, kenpom, ratings_eos, season
    )

    # features, then joint normalisation of the model inputs
    feat_train = generate_features(regular_raw)
    feat_test = generate_features(tourney_raw)
    train_norm, test_norm = normalize_features(feat_train, feat_test, features_to_use)

    return regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm

def make_predictions(season, train_norm, test_norm, tourney, C = 1):
    """Fit a logistic regression on the training frame and score the tournament.

    Parameters
    ----------
    season : season whose tournament results are used for evaluation.
    train_norm, test_norm : normalised frames from ``normalize_features``.
    tourney : tournament results with 'Season', 'Daynum', 'Wteam', 'Lteam'.
    C : inverse regularisation strength passed to LogisticRegression.

    Returns
    -------
    predictions : DataFrame ('Id', 'Pred') for every row of ``test_norm``,
        where 'Pred' is the second column of ``predict_proba``.
    res : DataFrame ('Id', 'Result', 'Pred') for the games actually played.
    ll : log loss of 'Pred' against 'Result'.
    """
    # fit
    model = sklm.LogisticRegression(C = C) # fit_intercept = False???
    model.fit(train_norm[features_to_use].values, train_norm[predict_field].values)

    # predictions for every candidate pairing
    win_prob = model.predict_proba(test_norm[features_to_use].values)[:, 1]
    predictions = pd.DataFrame({'Id' : get_key(test_norm).values, 'Pred' : win_prob})

    # games actually played; only rows with Daynum > 135 are evaluated
    # (presumably this drops the play-in games -- confirm against the data)
    played = tourney[(tourney['Season'] == season) & (tourney['Daynum'] > 135)].copy().reset_index()
    played['Team1'] = np.where(played['Wteam'] < played['Lteam'], played['Wteam'], played['Lteam'])
    played['Team2'] = np.where(played['Wteam'] > played['Lteam'], played['Wteam'], played['Lteam'])
    played['Result'] = (played['Wteam'] == played['Team1']).map(int)
    played['Id'] = get_key(played)

    # attach predictions to the played games and score them
    res = played[['Id', 'Result']].merge(predictions, on = 'Id', how = 'left')
    ll = skm.log_loss(res['Result'], res['Pred'])

    return predictions, res, ll

In [28]:
# Fit and score one model per season, collecting the predictions for submission.
all_predictions = []
for season in (2013, 2014, 2015, 2016):
    regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm = get_features(
        season, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog
    )
    # see below for choice of C
    predictions, res, ll = make_predictions(season, train_norm, test_norm, tourney, C = 5e-3)
    print(ll)
    all_predictions.append(predictions)


0.558642330544
0.542229424684
0.480054395486
0.510823217386

In [ ]:
# 0.559078513104 -- 2013
# 0.541984791608 -- 2014
# 0.480356337664 -- 2015
# 0.511671826092 -- 2016

In [477]:
# Stack the per-season prediction frames and write them as one CSV (no index column).
pd.concat(all_predictions).to_csv('./submissions/simpleLogisticModel2013to2016_tuned.csv', index = False)

In [632]:
# Pairwise view of the normalised training features, coloured by the label.
# The label column name lives in predict_field; `predict` was never defined.
sns.pairplot(train_norm, hue = predict_field, vars = features_to_use)
plt.show()


Sandbox explorations


In [309]:
# look up the names of the two teams inspected below
teams.loc[teams['Team_Id'].isin([1163, 1196])]


Out[309]:
Team_Id Team_Name
62 1163 Connecticut
95 1196 Florida

In [310]:
# raw inputs for the 1163 vs 1196 pairing
tourney_raw.query('Team1 == 1163 and Team2 == 1196')


Out[310]:
Team1 Team2 Season SlotMatchup dist_1 dist_2 mean_rtg_1 std_rtg_1 num_rtg_1 mean_rtg_2 std_rtg_2 num_rtg_2 adjem_1 adjem_2 std_mgn_1 std_mgn_2
2025 1163 1196 2014 R5WX 1498.536286 899.245448 22.907692 4.920376 65 2.092308 1.388829 65 22.11 28.55 15.27287 10.145172

In [311]:
# engineered features for one pairing
# NOTE(review): the saved output under this cell shows Team1 == 1163, not 1195 --
# confirm which pairing is intended before relying on it.
feat_test.query('Team1 == 1195 and Team2 == 1196')


Out[311]:
Team1 Team2 Season ln_dist_diff rtg_diff t_rtg pt_diff t_score
2025 1163 1196 2014 0.510244 -20.815385 -4.071369 -6.44 -0.351234

In [571]:
# Games ordered from worst to best prediction (largest |Pred - Result| first).
# Positional .iloc replaces .ix, which no longer exists in current pandas.
res.iloc[np.argsort(-(res['Pred'] - res['Result']).abs().values)].reset_index(drop = True)


Out[571]:
Id Result Pred
0 2013_1195_1207 1 0.132318
1 2013_1217_1307 1 0.155442
2 2013_1195_1361 1 0.232046
3 2013_1211_1455 0 0.717839
4 2013_1243_1247 0 0.694749
5 2013_1326_1455 0 0.655845
6 2013_1332_1387 1 0.360817
7 2013_1247_1279 1 0.385783
8 2013_1279_1458 1 0.387495
9 2013_1301_1396 0 0.611246
10 2013_1266_1274 1 0.407874
11 2013_1231_1393 0 0.591224
12 2013_1329_1332 0 0.583771
13 2013_1338_1455 0 0.576519
14 2013_1143_1424 1 0.444394
15 2013_1235_1323 1 0.447536
16 2013_1196_1276 0 0.527638
17 2013_1161_1281 1 0.475924
18 2013_1272_1388 1 0.503020
19 2013_1242_1276 0 0.481214
20 2013_1160_1228 0 0.477644
21 2013_1276_1393 1 0.548947
22 2013_1139_1266 0 0.448682
23 2013_1112_1326 0 0.442424
24 2013_1181_1277 1 0.567102
25 2013_1328_1361 0 0.424199
26 2013_1278_1417 1 0.592172
27 2013_1181_1257 0 0.389304
28 2013_1137_1139 0 0.388393
29 2013_1314_1437 1 0.612684
... ... ... ...
33 2013_1257_1276 1 0.625686
34 2013_1276_1433 1 0.627119
35 2013_1112_1125 1 0.652192
36 2013_1242_1314 1 0.655322
37 2013_1247_1455 0 0.342686
38 2013_1166_1181 0 0.342179
39 2013_1143_1393 0 0.339969
40 2013_1172_1266 0 0.331516
41 2013_1103_1433 0 0.323716
42 2013_1228_1274 0 0.321120
43 2013_1196_1278 1 0.687194
44 2013_1235_1326 0 0.260738
45 2013_1161_1257 0 0.236103
46 2013_1277_1434 1 0.777846
47 2013_1257_1455 1 0.783571
48 2013_1257_1332 1 0.788143
49 2013_1231_1396 1 0.817128
50 2013_1308_1387 0 0.175497
51 2013_1112_1217 1 0.841982
52 2013_1276_1355 1 0.846034
53 2013_1285_1393 0 0.151820
54 2013_1233_1326 0 0.116557
55 2013_1274_1334 1 0.884251
56 2013_1195_1196 0 0.099069
57 2013_1196_1322 1 0.905434
58 2013_1107_1181 0 0.091412
59 2013_1211_1380 1 0.932522
60 2013_1242_1443 1 0.941689
61 2013_1231_1241 1 0.958962
62 2013_1257_1299 1 0.982159

63 rows × 3 columns


In [287]:
# accuracy: share of games where thresholding Pred at 0.5 matches the result
np.where(res['Pred'] > 0.5, res['Result'] == 1, res['Result'] == 0).mean()


Out[287]:
0.69841269841269837

Effect of C on different years


In [23]:
# Sweep the regularisation strength C over a log-spaced grid (10^-4 up to just
# under 10^2, in steps of 0.1 in the exponent) and record the log loss per year.
cs_to_check = np.power(10, np.arange(-4, 2, 0.1))
years_to_check = range(2011, 2017)
c_effect_df_dict = { 'C' : cs_to_check }
for yr in years_to_check:
    # NOTE: this rebinds the notebook-level regular_raw / tourney_raw / feat_* /
    # *_norm names; cells further down see the values of the last year in the loop.
    regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm = \
        get_features(yr, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog)
    # index [2] of make_predictions' return value is the log loss
    log_losses = [ make_predictions(yr, train_norm, test_norm, tourney, C = C)[2] for C in cs_to_check ]
    c_effect_df_dict[str(yr)] = log_losses
# one row per C value, one column per year (plus the 'C' column itself)
c_effect = pd.DataFrame(c_effect_df_dict)

In [24]:
# Log loss as a function of C, one line per year.
plt.semilogx()
for col in [ col for col in c_effect if col != 'C' ]:
    # label each line explicitly so the legend has an entry for every year
    plt.plot(c_effect['C'], c_effect[col], label = col)
plt.legend(loc = 3)
plt.xlabel('C')
plt.ylabel('logloss')
plt.ylim(0.45, 0.65)
plt.show()


Look at who is contributing to logloss


In [25]:
# contribution of each played game to the overall logloss
rc = res.copy()
ftc = feat_test.copy()
ftc['Id'] = get_key(ftc)
rc['logloss_contrib'] = -np.log(np.where(rc['Result'] == 1, rc['Pred'], 1 - rc['Pred'])) / len(rc)
ftc = pd.merge(rc, ftc, how = 'left', on = 'Id')

# The colormap is passed by name; plt.cm.get_cmap is not available in current matplotlib.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (10, 4))
im = axes[0].scatter(ftc['t_score'], ftc['t_rtg'], c = ftc['logloss_contrib'], vmin = 0, vmax = 0.025, cmap = 'coolwarm')
axes[0].set_xlabel('t_score')
axes[0].set_ylabel('t_rtg')
axes[1].scatter(-ftc['ln_dist_diff'], ftc['t_rtg'], c = ftc['logloss_contrib'], vmin = 0, vmax = 0.025, cmap = 'coolwarm')
# the x values are negated, so the label says so
axes[1].set_xlabel('-ln_dist_diff')
cb = fig.colorbar(im, ax=axes.ravel().tolist(), label = 'logloss_contrib')
plt.show()


Logloss contribution by round


In [26]:
# Tag every candidate pairing with its round (the second character of the slot name).
tourney_rounds = tourney_raw[['Team1', 'Team2', 'Season', 'SlotMatchup']].copy()
tourney_rounds['Id'] = get_key(tourney_rounds)
tourney_rounds['round'] = tourney_rounds['SlotMatchup'].str[1].astype(int)
tourney_rounds = tourney_rounds[['Id', 'round']]
ftc_with_rounds = ftc.merge(tourney_rounds, how = 'left', on = 'Id')

# left panel: default barplot estimator per round; right panel: per-round maximum
fig, axs = plt.subplots(ncols=2, figsize = (10, 4))
sns.barplot(data = ftc_with_rounds, x = 'round', y = 'logloss_contrib', errwidth = 0, ax = axs[0])
sns.barplot(data = ftc_with_rounds, x = 'round', y = 'logloss_contrib', errwidth = 0, estimator=max, ax = axs[1])
plt.setp(axs, ylim = (0, 0.035))
plt.show()


Overtime counts


In [398]:
# 'Numot' per season, for seasons after 2000
sns.barplot(data = reg_season.query('Season > 2000'), x = 'Season', y = 'Numot', errwidth = 0)
plt.show()


A look at dynamics of ratings data


In [321]:
# ratings_eos is otherwise only built inside get_features, so build it at
# notebook level before plotting.
ratings_eos = get_eos_ratings(ratings)
# x / y are passed by keyword; current seaborn does not accept them positionally
sns.lmplot(x = 'mean_rtg', y = 'std_rtg', data = ratings_eos, fit_reg = False)
plt.show()



In [353]:
# Compare std_rtg with a parabola in mean_rtg that is zero at mean_rtg = 0 and at
# mean_rtg = max, and peaks halfway between.
ratings_eos_test = ratings_eos.copy()
ratings_eos_test['parabola_mean_model'] =(ratings_eos_test['mean_rtg'].max()/2)**2-(ratings_eos_test['mean_rtg'] - ratings_eos_test['mean_rtg'].max()/2)**2
# x / y are passed by keyword; current seaborn does not accept them positionally
sns.lmplot(x = 'parabola_mean_model', y = 'std_rtg', data = ratings_eos_test, fit_reg = False)
plt.show()



In [352]:
# NOTE(review): `test_data` is not defined anywhere in the visible notebook --
# presumably a frame carrying mean_rtg_* / std_rtg_* columns (tourney_raw has
# them); confirm the intended source before re-running this cell.
test_data_test = test_data.copy()
# raw rating difference (team 1 minus team 2) and the same difference scaled by the combined spread
test_data_test['rtg_diff'] = test_data_test['mean_rtg_1'] - test_data_test['mean_rtg_2']
test_data_test['t_model'] = test_data_test['rtg_diff']/(test_data_test['std_rtg_1']**2 + test_data_test['std_rtg_2']**2)**0.5
#sns.lmplot('rtg_diff', 't_model', data = test_data_test, fit_reg = False)
sns.pairplot(test_data_test[['rtg_diff', 't_model']])
plt.show()


Quick investigation: looks like avg score decreases with log of distance traveled


In [247]:
# One (distance travelled, points scored) row per team per 2016 regular-season game.
# get_training_data is not defined in this notebook; reg_distance_to_game on the
# 2016 games yields the w_dist / l_dist columns (next to Wscore / Lscore) used below.
dist_test = reg_distance_to_game(reg_season[reg_season['Season'] == 2016], team_geog)
w_dist_test = dist_test[['w_dist', 'Wscore']].rename(columns = {'w_dist' : 'dist', 'Wscore' : 'score'})
l_dist_test = dist_test[['l_dist', 'Lscore']].rename(columns = {'l_dist' : 'dist', 'Lscore' : 'score'})
dist_test = pd.concat([w_dist_test, l_dist_test]).reset_index()[['dist', 'score']]

In [270]:
# histogram of distance travelled, log-scaled counts, x axis limited to 0-3000
plt.semilogy()
plt.hist(dist_test['dist'])
plt.xlim(0, 3000)
plt.show()



In [324]:
# Bucket games by floor(log(dist + 1)) and summarise the score within each bucket.
bucket_size = 1
dist_test['bucket'] = bucket_size * (np.log(dist_test['dist'] + 1) // bucket_size)
# Select 'score' before aggregating and name the reducers by string so pandas'
# own mean / sample std are used regardless of how NumPy callables are handled.
dist_grp = dist_test.groupby('bucket')['score'].agg(['mean', 'std', len])
# standard error of the bucket mean
dist_grp['err'] = dist_grp['std'] / np.sqrt(dist_grp['len'])

In [325]:
# mean score per distance bucket with a +/- 2 standard-error band
plt.plot(dist_grp.index, dist_grp['mean'])
plt.fill_between(
    dist_grp.index,
    (dist_grp['mean'] - 2*dist_grp['err']).values,
    (dist_grp['mean'] + 2*dist_grp['err']).values,
    alpha = 0.3
)
plt.xlabel('log of distance traveled')
plt.ylabel('avg score')
plt.show()