In [1]:

    
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from geopy.distance import great_circle as geodist
import sklearn.linear_model as sklm
import sklearn.preprocessing as skpp
import sklearn.metrics as skm
import sklearn.model_selection as skms



In [2]:

    
# load data
# Every input is a CSV read relative to the notebook's working directory.

# Game-level results (regular season and tournament)
reg_season = pd.read_csv('data/RegularSeasonDetailedResults.csv')
tourney = pd.read_csv('data/TourneyDetailedResults.csv')
# Third-party ordinal rankings; used below via season / rating_day_num / team / orank
ratings = pd.read_csv('data/addl/massey_ordinals_2003-2016.csv')
# Latitude / longitude of team homes and of tournament slots
team_geog = pd.read_csv('data/addl/TeamGeog.csv')
tourney_geog = pd.read_csv('data/addl/TourneyGeog_Thru2016.csv')
# Bracket structure (Slot / Strongseed / Weakseed) and per-season seed assignments
tourney_slots = pd.read_csv('data/TourneySlots.csv')
tourney_seeds = pd.read_csv('data/TourneySeeds.csv')
# KenPom team statistics (AdjEM and AdjTempo are used below)
kenpom = pd.read_csv('data/kenPomTeamData.csv')
# Team id -> team name lookup
teams = pd.read_csv('data/Teams.csv')

Almond Nut Learner

Use published rankings together with distance traveled to play to classify winners + losers

Train on the regular season and test on the postseason

considerations:

Refine
- Vegas odds in first round
- PREDICTING UPSETS??
  - team upset rating
  - team score variance
  - upset predictors based on past seasons
- Ratings closer to date played
- Model tuning / hyperparameter tuning
Implemented
- individual ratings vs aggregate
  - Look at aggregate and derive statistics
- diff vs absolute ratings
  - Use diffs for feature generation
- only use final rankings instead of those at time of play?
  - For now: time of play
- Distance from home? Distance from last game?
  - For now: distance from home
- How do regular season and playoffs differ in features?
  - Is using distance in playoffs trained on regular season right?
Augment (not yet executed)
- Defensive / offense ratings from kenpom
- Elo, Elo differences, and assoc probabilities
- Ensemble?
  - Construct micro-classifier from elo
- Coaches
- Look at momentum + OT effects when training
- Beginning of season vs end of season for training



In [3]:

    
def attach_ratings_diff_stats(df, ratings_eos, season):
    """Attach end-of-season rating statistics for both teams of every matchup.

    df          -- matchups with 'Season', 'Team1' and 'Team2' columns
    ratings_eos -- per (season, team) statistics with columns 'season', 'team',
                   'mean_rtg', 'std_rtg', 'num_rtg'
    season      -- accepted for a uniform signature; the join itself is keyed on
                   df['Season'], so this argument is not used

    Returns df's columns followed by the six '<stat>_1' / '<stat>_2' columns.
    Both joins are inner joins, so matchups without ratings for either team
    are dropped.
    """
    sides = (
        ('Team1', {'mean_rtg' : 'mean_rtg_1', 'std_rtg' : 'std_rtg_1', 'num_rtg' : 'num_rtg_1'}),
        ('Team2', {'mean_rtg' : 'mean_rtg_2', 'std_rtg' : 'std_rtg_2', 'num_rtg' : 'num_rtg_2'}),
    )

    wanted = list(df.columns)
    merged = df
    for team_col, renames in sides:
        wanted.extend(renames.values())
        merged = merged.merge(
            ratings_eos.rename(columns = renames),
            left_on = ['Season', team_col],
            right_on = ['season', 'team'],
        )
    # the join keys coming from ratings_eos are discarded here
    return merged[wanted]

def get_eos_ratings(ratings):
    """Summarise each team's rankings on the last rating day of every season.

    ratings -- one row per published ranking, with at least the columns
               'season', 'rating_day_num', 'team' and 'orank'

    Returns a frame with one row per (season, team) and the columns
    'season', 'team', 'mean_rtg', 'std_rtg' (sample std, NaN when only one
    ranking exists) and 'num_rtg' (number of rankings on that day).
    """
    # Only rows published on the season's latest rating day are kept.
    last_day = ratings.groupby('season')['rating_day_num'].transform('max')
    ratings_eos_all = ratings[ratings['rating_day_num'] == last_day]

    # Aggregate 'orank' only: aggregating every column breaks on non-numeric
    # columns, and string aggregation names avoid pandas' deprecated
    # aliasing of numpy callables.
    ratings_eos = ratings_eos_all\
        .groupby(['season', 'team'])['orank']\
        .agg(mean_rtg = 'mean', std_rtg = 'std', num_rtg = 'size')
    return ratings_eos.reset_index()



In [20]:

    
def get_score_fluctuation(reg_season, season):
    """Per-team standard deviation of possession-scaled scoring margins.

    reg_season -- detailed regular-season results (needs 'Season', 'Wteam',
                  'Lteam', 'Wscore', 'Lscore', 'Wloc' and the box-score columns
                  'Wfga', 'Wor', 'Wto', 'Wfta', 'Lfga', 'Lor', 'Lto', 'Lfta')
    season     -- season to summarise

    Returns a frame with columns 'team' and 'std_mgn' (sample std of the
    team's margins, expressed per 100 possessions).

    Note: quick and dirty; a finer home / away treatment would only improve the
    std estimate at second order. Units do not really matter because the value
    is used in a ratio and normalized later.
    """
    games = reg_season[reg_season['Season'] == season]
    home_win = games['Wloc'] == 'H'
    away_win = games['Wloc'] == 'A'

    # average home score minus average away score over non-neutral games
    hscores = games.loc[home_win, 'Wscore'].tolist() + games.loc[away_win, 'Lscore'].tolist()
    ascores = games.loc[away_win, 'Wscore'].tolist() + games.loc[home_win, 'Lscore'].tolist()
    if hscores and ascores:
        home_correction = np.mean(hscores) - np.mean(ascores)
    else:
        # no home / away games at all: nothing to correct (avoids mean of an empty list)
        home_correction = 0.0

    # estimated possessions per game, averaged over the two teams
    possessions = 0.5 * (
        games['Lfga'] - games['Lor'] + games['Lto'] + 0.475*games['Lfta'] +
        games['Wfga'] - games['Wor'] + games['Wto'] + 0.475*games['Wfta']
    )

    # winner's margin with the home advantage removed, then scaled to 100 possessions
    margin = games['Wscore'] - games['Lscore']
    margin = margin + np.where(home_win, -home_correction, 0)
    margin = margin + np.where(away_win, home_correction, 0)
    scaled = margin * 100 / possessions

    # every game contributes +margin to the winner and -margin to the loser
    per_team = pd.concat([
        pd.DataFrame({'team' : games['Wteam'], 'mgn' : scaled}),
        pd.DataFrame({'team' : games['Lteam'], 'mgn' : -scaled}),
    ])

    # explicit sample std (ddof = 1) instead of relying on pandas aliasing np.std
    return per_team.groupby('team')['mgn'].std().rename('std_mgn').reset_index()

def attach_score_fluctuations(df, reg_season, season):
    """Add 'std_mgn_1' / 'std_mgn_2' (margin std of Team1 / Team2) to df.

    The statistics come from get_score_fluctuation(reg_season, season). Both
    joins are inner joins, so rows whose team has no statistic are dropped.
    """
    fluct = get_score_fluctuation(reg_season, season)
    cols_to_keep = list(df.columns) + ['std_mgn_1', 'std_mgn_2']

    merged = df
    for team_col, stat_col in (('Team1', 'std_mgn_1'), ('Team2', 'std_mgn_2')):
        side_fluct = fluct.rename(columns = {'std_mgn' : stat_col})
        merged = merged.merge(side_fluct, left_on = team_col, right_on = 'team')
    return merged[cols_to_keep]



In [5]:

    
def attach_kenpom_stats(df, kenpom, season):
    """Add KenPom AdjEM / AdjTempo of both teams as adjem_1/2 and adjt_1/2.

    df     -- matchups with 'Team1' and 'Team2' columns
    kenpom -- frame with 'Season', 'Team_Id', 'AdjEM', 'AdjTempo'
    season -- season whose KenPom rows are used

    Inner joins: matchups with a team missing from kenpom for that season
    are dropped.
    """
    cols_to_keep = list(df.columns) + ['adjem_1', 'adjem_2', 'adjt_1', 'adjt_2']
    season_stats = kenpom[kenpom['Season'] == season][['Team_Id', 'AdjEM', 'AdjTempo']]

    sides = (
        ('Team1', {'AdjEM' : 'adjem_1', 'AdjTempo' : 'adjt_1'}),
        ('Team2', {'AdjEM' : 'adjem_2', 'AdjTempo' : 'adjt_2'}),
    )
    merged = df
    for team_col, renames in sides:
        merged = merged.merge(
            season_stats.rename(columns = renames),
            left_on = team_col,
            right_on = 'Team_Id',
        )
    return merged[cols_to_keep]



In [6]:

    
def get_root_and_leaves(hierarchy):
    """Find the root slot and the leaf seeds of a bracket.

    hierarchy -- frame with 'Slot', 'Strongseed' and 'Weakseed' columns, where
                 each row says the two named children feed into 'Slot'

    Returns (root, leaves): root is a slot that is nobody's child, leaves are
    the children that are never a slot themselves (in arbitrary order).
    Raises IndexError when no root exists.
    """
    all_children = set(hierarchy[['Strongseed', 'Weakseed']].values.flatten())
    all_parents = set(hierarchy[['Slot']].values.flatten())

    roots = []
    for parent in all_parents:
        if parent not in all_children:
            roots.append(parent)

    leaves = []
    for child in all_children:
        if child not in all_parents:
            leaves.append(child)

    return roots[0], leaves

def get_tourney_tree_one_season(tourney_slots, season):
    """Build the bracket tree of one season.

    Returns a dict mapping every seed / slot that feeds another slot to
    {'parent': <slot it feeds into>, 'depth': <number of steps up to the root>}.
    The root slot itself has no entry. Depths are filled in for every node on
    a path from a leaf to the root.
    """
    season_rows = tourney_slots[tourney_slots['Season'] == season]
    hierarchy = season_rows[['Slot', 'Strongseed', 'Weakseed']]
    root, leaves = get_root_and_leaves(hierarchy) # should be R6CH...

    # child -> parent slot; weak-seed entries are applied after strong-seed ones
    parent_of = dict(zip(hierarchy['Strongseed'], hierarchy['Slot']))
    parent_of.update(zip(hierarchy['Weakseed'], hierarchy['Slot']))
    tree = {}
    for child, parent in parent_of.items():
        tree[child] = {'parent' : parent, 'depth' : -1}

    def depth_of(node):
        # depth is memoised in the tree; -1 marks "not computed yet"
        if node == root:
            return 0
        entry = tree[node]
        if entry['depth'] < 0:
            entry['depth'] = 1 + depth_of(entry['parent'])
        return entry['depth']

    for leaf in leaves:
        depth_of(leaf)

    return tree

def get_tourney_trees(tourney_slots):
    """Return {season: bracket tree} for every season present in tourney_slots."""
    trees = {}
    for season in tourney_slots['Season'].unique():
        trees[season] = get_tourney_tree_one_season(tourney_slots, season)
    return trees

def slot_matchup_from_seed(tree, seed1, seed2):
    """Return the slot in which the two seeds / slots would face off.

    tree maps a node to {'parent': ..., 'depth': ...}. Both nodes are walked
    towards the root until they coincide: at each step the deeper node moves
    to its parent, and both move when they are at the same depth.
    """
    node1, node2 = seed1, seed2
    while node1 != node2:
        depth1 = tree[node1]['depth']
        depth2 = tree[node2]['depth']
        # decide both moves from the current depths before updating either node
        climb1 = depth1 >= depth2
        climb2 = depth2 >= depth1
        if climb1:
            node1 = tree[node1]['parent']
        if climb2:
            node2 = tree[node2]['parent']
    return node1

def get_team_seed(tourney_seeds, season, team):
    """Return the 'Seed' of `team` in `season`, or None.

    None is returned unless exactly one row of tourney_seeds matches both the
    team and the season.
    """
    is_match = (tourney_seeds['Team'] == team) & (tourney_seeds['Season'] == season)
    matches = tourney_seeds[is_match]['Seed'].values
    return matches[0] if len(matches) == 1 else None



In [7]:

    
def dist(play_lat, play_lng, lat, lng):
    """Great-circle distance in miles between the game site and (lat, lng)."""
    game_site = (play_lat, play_lng)
    origin = (lat, lng)
    return geodist(game_site, origin).miles

def reg_distance_to_game(games_in, team_geog):
    """Add 'w_dist' / 'l_dist': miles travelled by the winner and the loser.

    games_in  -- regular-season games with 'Wteam', 'Lteam' and 'Wloc'
    team_geog -- team locations with 'team_id', 'lat' and 'lng'

    The game is located at the winner's home when Wloc == 'H' and at the
    loser's home otherwise. For neutral games (Wloc == 'N') the true site is
    not known here, so both teams get half of the distance between their homes.
    Inner joins: games with a team missing from team_geog are dropped.
    """
    out_cols = list(games_in.columns) + ['w_dist', 'l_dist']

    winner_geog = team_geog.rename(columns = {'lat' : 'w_lat', 'lng' : 'w_lng'})
    loser_geog = team_geog.rename(columns = {'lat' : 'l_lat', 'lng' : 'l_lng'})
    games = games_in.copy()\
        .merge(winner_geog, left_on = 'Wteam', right_on = 'team_id')\
        .merge(loser_geog, left_on = 'Lteam', right_on = 'team_id')

    # provisional game site; neutral games are corrected further down
    winner_is_host = games['Wloc'] == 'H'
    games['play_lat'] = np.where(winner_is_host, games['w_lat'], games['l_lat'])
    games['play_lng'] = np.where(winner_is_host, games['w_lng'], games['l_lng'])

    def travelled(row, side):
        return dist(row['play_lat'], row['play_lng'], row[side + '_lat'], row[side + '_lng'])

    games['w_dist'] = games.apply(lambda row: travelled(row, 'w'), axis = 1)
    games['l_dist'] = games.apply(lambda row: travelled(row, 'l'), axis = 1)

    # correct for neutral: computed from the uncorrected distances of both teams
    is_neutral = games['Wloc'] == 'N'
    shared = (games['w_dist'] + games['l_dist'])/2
    games['w_dist'] = np.where(is_neutral, shared, games['w_dist'])
    games['l_dist'] = np.where(is_neutral, shared, games['l_dist'])
    return games[out_cols]

def tourney_distance_to_game(tourney_raw_in, tourney_geog, team_geog, season):
    """Add 'dist_1' / 'dist_2': miles from each team's home to the slot's site.

    tourney_raw_in -- matchups with 'Team1', 'Team2' and 'SlotMatchup'
    tourney_geog   -- slot locations with 'season', 'slot', 'lat', 'lng'
    team_geog      -- team locations with 'team_id', 'lat', 'lng'
    season         -- season whose slot locations are used

    All joins are inner joins, so matchups lacking a team or slot location
    are dropped.
    """
    out_cols = list(tourney_raw_in.columns) + ['dist_1', 'dist_2']

    site_geog = tourney_geog[tourney_geog['season'] == season][['slot', 'lat', 'lng']]
    site_geog = site_geog.rename(columns = {'lat' : 'lat_p', 'lng' : 'lng_p'})

    located = tourney_raw_in.copy()
    for team_col, renames in (
        ('Team1', {'lat' : 'lat_1', 'lng' : 'lng_1'}),
        ('Team2', {'lat' : 'lat_2', 'lng' : 'lng_2'}),
    ):
        located = located.merge(team_geog.rename(columns = renames), left_on = team_col, right_on = 'team_id')
    located = located.merge(site_geog, left_on = 'SlotMatchup', right_on = 'slot')

    located['dist_1'] = located.apply(
        lambda row: dist(row['lat_p'], row['lng_p'], row['lat_1'], row['lng_1']), axis = 1)
    located['dist_2'] = located.apply(
        lambda row: dist(row['lat_p'], row['lng_p'], row['lat_2'], row['lng_2']), axis = 1)

    return located[out_cols]



In [8]:

    
def get_raw_reg_season_data(reg_season, team_geog, season):
    """Regular-season games of one season in Team1 / Team2 form.

    Team1 is the team with the lower id and Team2 the one with the higher id;
    'score_*' and 'dist_*' carry that team's score and travel distance.
    Returns the columns Season, Daynum, Team1, Team2, score_1, score_2,
    dist_1, dist_2.
    """
    season_games = reg_season[reg_season['Season'] == season] # reg season raw
    games = reg_distance_to_game(season_games, team_geog)

    winner_is_low = games['Wteam'] < games['Lteam']
    winner_is_high = games['Wteam'] > games['Lteam']

    games = games.assign(
        Team1 = np.where(winner_is_low, games['Wteam'], games['Lteam']),
        Team2 = np.where(winner_is_high, games['Wteam'], games['Lteam']),
        score_1 = np.where(winner_is_low, games['Wscore'], games['Lscore']),
        score_2 = np.where(winner_is_high, games['Wscore'], games['Lscore']),
        dist_1 = np.where(winner_is_low, games['w_dist'], games['l_dist']),
        dist_2 = np.where(winner_is_high, games['w_dist'], games['l_dist']),
    )

    return games[['Season', 'Daynum', 'Team1', 'Team2', 'score_1', 'score_2', 'dist_1', 'dist_2']]

def get_raw_tourney_data(tourney_seeds, tourney_trees, tourney_geog, team_geog, season):
    """Every possible pairing of the season's seeded teams, with travel distances.

    One row per unordered pair (Team1 < Team2) with 'Season', the 'SlotMatchup'
    in which the pair would meet according to the bracket tree, and the
    'dist_1' / 'dist_2' columns added by tourney_distance_to_game.
    """
    # tree to find play location
    tree = tourney_trees[season]

    # get all teams in tourney
    season_seeds = tourney_seeds[tourney_seeds['Season'] == season]
    seed_map = season_seeds.set_index('Team').to_dict()['Seed']
    teams = sorted(seed_map.keys())

    team_pairs = []
    for team1 in teams:
        for team2 in teams:
            if team1 < team2:
                team_pairs.append((team1, team2))
    team_pairs.sort()

    tourney_raw = pd.DataFrame(team_pairs).rename(columns = { 0 : 'Team1', 1 : 'Team2' })
    tourney_raw['Season'] = season

    # find out where they would play each other
    def meeting_slot(pair):
        return slot_matchup_from_seed(tree, seed_map[pair['Team1']], seed_map[pair['Team2']])

    tourney_raw['SlotMatchup'] = tourney_raw.apply(meeting_slot, axis = 1)

    # get features
    return tourney_distance_to_game(tourney_raw, tourney_geog, team_geog, season)

def attach_supplements(data, reg_season, kenpom, ratings_eos, season):
    """Attach rating statistics, KenPom statistics and margin fluctuations.

    Works on a copy of `data`. Each step joins with an inner join, so rows
    lacking any of the supplements are dropped.
    """
    with_ratings = attach_ratings_diff_stats(data.copy(), ratings_eos, season) # get ratings diff stats
    with_kenpom = attach_kenpom_stats(with_ratings, kenpom, season)
    return attach_score_fluctuations(with_kenpom, reg_season, season)

Feature engineering

Log of distance
Capture rating diffs
Capture rating diffs, accounting for variance (t score)
Diff in expected scores via EM diffs

Tag winners in training set + viz. Also, normalize data.



In [9]:

    
def generate_features(df):
    """Turn raw matchup data into model features.

    Expects dist_1/2, mean_rtg_1/2, std_rtg_1/2, adjem_1/2 and std_mgn_1/2.
    Produces:
      ln_dist_diff -- log((1 + dist_1) / (1 + dist_2))
      rtg_diff     -- mean_rtg_2 - mean_rtg_1 (sign flipped so the better-ranked
                      team gets the higher value)
      t_rtg        -- rtg_diff divided by sqrt(std_rtg_1^2 + std_rtg_2^2)
      pt_diff      -- adjem_1 - adjem_2
      t_score      -- pt_diff divided by sqrt(std_mgn_1^2 + std_mgn_2^2)
      Team1_win    -- score_1 > score_2, only when both score columns exist
    """
    cols_to_keep = ['Team1', 'Team2', 'Season', 'ln_dist_diff', 'rtg_diff', 't_rtg', 'pt_diff', 't_score']

    rating_spread = np.sqrt(df['std_rtg_1']**2 + df['std_rtg_2']**2)
    margin_spread = np.sqrt(df['std_mgn_1']**2 + df['std_mgn_2']**2)
    # use negative so that better team has higher statistic than worse team
    neg_rating_gap = -(df['mean_rtg_1'] - df['mean_rtg_2'])
    em_gap = df['adjem_1'] - df['adjem_2']

    features = df.assign(
        ln_dist_diff = np.log((1 + df['dist_1'])/(1 + df['dist_2'])),
        rtg_diff = neg_rating_gap,
        t_rtg = neg_rating_gap / rating_spread,
        pt_diff = em_gap,
        t_score = em_gap / margin_spread,
    )

    # truth feature: did team 1 win?
    if 'score_1' in df.columns and 'score_2' in df.columns:
        features['Team1_win'] = features['score_1'] > features['score_2']
        cols_to_keep = cols_to_keep + ['Team1_win']

    return features[cols_to_keep]

def normalize_features(train, test, features):
    """Standardise the `features` columns of train and test together.

    The two frames are stacked and scaled with skpp.scale in a single pass, so
    the scaling statistics are computed over train and test combined.
    Returns scaled copies (train_norm, test_norm); the inputs are not modified.
    """
    n_train = len(train)
    stacked = pd.concat([train[features], test[features]])
    scaled = skpp.scale(stacked) # with_mean = False ?

    train_norm, test_norm = train.copy(), test.copy()
    # the first n_train scaled rows belong to train, the rest to test
    train_norm[features] = scaled[:n_train]
    test_norm[features] = scaled[n_train:]
    return train_norm, test_norm



In [10]:

    
def get_key(df):
    """Build the matchup id '<Season>_<Team1>_<Team2>' for every row of df."""
    parts = [df[col].map(str) for col in ('Season', 'Team1', 'Team2')]
    key = parts[0]
    for part in parts[1:]:
        key = key + '_' + part
    return key

Running the model



In [21]:

    
# Feature columns (produced by generate_features) that are normalized and fed to the model
features_to_use = ['ln_dist_diff', 'rtg_diff', 't_rtg', 'pt_diff', 't_score']
# Label column: generate_features sets it to score_1 > score_2
predict_field = 'Team1_win'

def get_features(season, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog):
    """Build training (regular season) and test (all tournament pairings) data.

    Returns, in order: regular_raw, tourney_raw (cleaned data with supplements
    attached), feat_train, feat_test (engineered features) and train_norm,
    test_norm (the same features after joint normalization of the
    `features_to_use` columns).
    """
    # support data
    ratings_eos = get_eos_ratings(ratings)
    tourney_trees = get_tourney_trees(tourney_slots)

    def with_supplements(raw):
        return attach_supplements(raw, reg_season, kenpom, ratings_eos, season)

    # regular season cleaned data
    regular_raw = with_supplements(get_raw_reg_season_data(reg_season, team_geog, season))

    # post season cleaned data
    tourney_raw = with_supplements(
        get_raw_tourney_data(tourney_seeds, tourney_trees, tourney_geog, team_geog, season))

    # get and normalize features
    feat_train, feat_test = generate_features(regular_raw), generate_features(tourney_raw)
    train_norm, test_norm = normalize_features(feat_train, feat_test, features_to_use)

    return regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm

def make_predictions(season, train_norm, test_norm, tourney, C = 1):
    """Fit a logistic regression and score it on the season's tournament games.

    season     -- season whose tournament results are used for evaluation
    train_norm -- normalized training features including the label column
    test_norm  -- normalized features of every candidate matchup
    tourney    -- tournament results ('Season', 'Daynum', 'Wteam', 'Lteam')
    C          -- inverse regularization strength passed to LogisticRegression

    Returns (predictions, res, ll):
      predictions -- 'Id' / 'Pred' for every row of test_norm
      res         -- 'Id' / 'Result' / 'Pred' for the games actually played
                     (Result is 1 when the lower-id team won)
      ll          -- log loss of Pred against Result
    Only games with Daynum > 135 are evaluated.
    """
    # fit
    model = sklm.LogisticRegression(C = C) # fit_intercept = False???
    model.fit(train_norm[features_to_use].values, train_norm[predict_field].values)

    # predictions: second probability column is taken as P(Team1 wins)
    win_probs = model.predict_proba(test_norm[features_to_use].values)[:, 1]
    predictions = pd.DataFrame({'Id' : get_key(test_norm).values, 'Pred' : win_probs})

    # Evaluate outcomes
    in_scope = (tourney['Season'] == season) & (tourney['Daynum'] > 135)
    played = tourney[in_scope].copy().reset_index()
    winner, loser = played['Wteam'], played['Lteam']
    played['Team1'] = np.where(winner < loser, winner, loser)
    played['Team2'] = np.where(winner > loser, winner, loser)
    played['Result'] = (played['Wteam'] == played['Team1']).map(lambda won: 1 if won else 0)
    played['Id'] = get_key(played)

    # attach results to predictions
    res = played[['Id', 'Result']].merge(predictions, on = 'Id', how = 'left')
    # logloss
    ll = skm.log_loss(res['Result'], res['Pred'])

    return predictions, res, ll



In [28]:

    
# Fit one model per season and collect the predictions; the printed value is
# the tournament log loss of that season.
all_predictions = []
for season in [2013, 2014, 2015, 2016]:
    (regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm) = get_features(
        season, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog
    )
    # see below for choice of C
    predictions, res, ll = make_predictions(season, train_norm, test_norm, tourney, C = 5e-3)
    print(ll)
    all_predictions.append(predictions)









    



0.558642330544
0.542229424684
0.480054395486
0.510823217386



In [ ]:

    
# 0.559078513104 -- 2013
# 0.541984791608 -- 2014
# 0.480356337664 -- 2015
# 0.511671826092 -- 2016



In [477]:

    
# Stack the per-season 'Id' / 'Pred' frames and write them out as one submission file.
pd.concat(all_predictions).to_csv('./submissions/simpleLogisticModel2013to2016_tuned.csv', index = False)



In [632]:

    
# Pairwise view of the normalized training features, colored by the label.
# The label column name lives in `predict_field` (`predict` was never defined).
sns.pairplot(train_norm, hue = predict_field, vars = features_to_use)
plt.show()

Sandbox explorations



In [309]:

    
# Look up the names of the two teams inspected in the following cells.
teams[teams['Team_Id'].isin([1163, 1196])]









    Out[309]:






  
    
      
      Team_Id
      Team_Name
    
  
  
    
      62
      1163
      Connecticut
    
    
      95
      1196
      Florida



In [310]:

    
# Raw (pre-feature) tournament row for the 1163 vs 1196 pairing.
tourney_raw[(tourney_raw['Team1'] == 1163) & (tourney_raw['Team2'] == 1196)]









    Out[310]:






  
    
      
      Team1
      Team2
      Season
      SlotMatchup
      dist_1
      dist_2
      mean_rtg_1
      std_rtg_1
      num_rtg_1
      mean_rtg_2
      std_rtg_2
      num_rtg_2
      adjem_1
      adjem_2
      std_mgn_1
      std_mgn_2
    
  
  
    
      2025
      1163
      1196
      2014
      R5WX
      1498.536286
      899.245448
      22.907692
      4.920376
      65
      2.092308
      1.388829
      65
      22.11
      28.55
      15.27287
      10.145172



In [311]:

    
# Engineered features for the same 1163 vs 1196 pairing inspected above
# (the filter previously said 1195, which does not match the displayed row).
feat_test[(feat_test['Team1'] == 1163) & (feat_test['Team2'] == 1196)]









    Out[311]:






  
    
      
      Team1
      Team2
      Season
      ln_dist_diff
      rtg_diff
      t_rtg
      pt_diff
      t_score
    
  
  
    
      2025
      1163
      1196
      2014
      0.510244
      -20.815385
      -4.071369
      -6.44
      -0.351234



In [571]:

    
# Evaluated games ordered from the largest to the smallest absolute prediction error.
# `.ix` no longer exists in pandas; argsort yields positions, so `.iloc` is the right indexer.
res.iloc[np.argsort(-(res['Pred'] - res['Result']).abs().values)].reset_index(drop = True)









    Out[571]:






  
    
      
      Id
      Result
      Pred
    
  
  
    
      0
      2013_1195_1207
      1
      0.132318
    
    
      1
      2013_1217_1307
      1
      0.155442
    
    
      2
      2013_1195_1361
      1
      0.232046
    
    
      3
      2013_1211_1455
      0
      0.717839
    
    
      4
      2013_1243_1247
      0
      0.694749
    
    
      5
      2013_1326_1455
      0
      0.655845
    
    
      6
      2013_1332_1387
      1
      0.360817
    
    
      7
      2013_1247_1279
      1
      0.385783
    
    
      8
      2013_1279_1458
      1
      0.387495
    
    
      9
      2013_1301_1396
      0
      0.611246
    
    
      10
      2013_1266_1274
      1
      0.407874
    
    
      11
      2013_1231_1393
      0
      0.591224
    
    
      12
      2013_1329_1332
      0
      0.583771
    
    
      13
      2013_1338_1455
      0
      0.576519
    
    
      14
      2013_1143_1424
      1
      0.444394
    
    
      15
      2013_1235_1323
      1
      0.447536
    
    
      16
      2013_1196_1276
      0
      0.527638
    
    
      17
      2013_1161_1281
      1
      0.475924
    
    
      18
      2013_1272_1388
      1
      0.503020
    
    
      19
      2013_1242_1276
      0
      0.481214
    
    
      20
      2013_1160_1228
      0
      0.477644
    
    
      21
      2013_1276_1393
      1
      0.548947
    
    
      22
      2013_1139_1266
      0
      0.448682
    
    
      23
      2013_1112_1326
      0
      0.442424
    
    
      24
      2013_1181_1277
      1
      0.567102
    
    
      25
      2013_1328_1361
      0
      0.424199
    
    
      26
      2013_1278_1417
      1
      0.592172
    
    
      27
      2013_1181_1257
      0
      0.389304
    
    
      28
      2013_1137_1139
      0
      0.388393
    
    
      29
      2013_1314_1437
      1
      0.612684
    
    
      ...
      ...
      ...
      ...
    
    
      33
      2013_1257_1276
      1
      0.625686
    
    
      34
      2013_1276_1433
      1
      0.627119
    
    
      35
      2013_1112_1125
      1
      0.652192
    
    
      36
      2013_1242_1314
      1
      0.655322
    
    
      37
      2013_1247_1455
      0
      0.342686
    
    
      38
      2013_1166_1181
      0
      0.342179
    
    
      39
      2013_1143_1393
      0
      0.339969
    
    
      40
      2013_1172_1266
      0
      0.331516
    
    
      41
      2013_1103_1433
      0
      0.323716
    
    
      42
      2013_1228_1274
      0
      0.321120
    
    
      43
      2013_1196_1278
      1
      0.687194
    
    
      44
      2013_1235_1326
      0
      0.260738
    
    
      45
      2013_1161_1257
      0
      0.236103
    
    
      46
      2013_1277_1434
      1
      0.777846
    
    
      47
      2013_1257_1455
      1
      0.783571
    
    
      48
      2013_1257_1332
      1
      0.788143
    
    
      49
      2013_1231_1396
      1
      0.817128
    
    
      50
      2013_1308_1387
      0
      0.175497
    
    
      51
      2013_1112_1217
      1
      0.841982
    
    
      52
      2013_1276_1355
      1
      0.846034
    
    
      53
      2013_1285_1393
      0
      0.151820
    
    
      54
      2013_1233_1326
      0
      0.116557
    
    
      55
      2013_1274_1334
      1
      0.884251
    
    
      56
      2013_1195_1196
      0
      0.099069
    
    
      57
      2013_1196_1322
      1
      0.905434
    
    
      58
      2013_1107_1181
      0
      0.091412
    
    
      59
      2013_1211_1380
      1
      0.932522
    
    
      60
      2013_1242_1443
      1
      0.941689
    
    
      61
      2013_1231_1241
      1
      0.958962
    
    
      62
      2013_1257_1299
      1
      0.982159
    
  

63 rows × 3 columns



In [287]:

    
# accuracy? -- share of games where thresholding Pred at 0.5 picks the actual result
np.where(res['Pred'] > 0.5, res['Result'] == 1, res['Result'] == 0).mean()









    Out[287]:





0.69841269841269837

Effect of C on different years



In [23]:

    
# Sweep the regularization constant C on a log grid and record the tournament
# log loss per year: one 'C' column plus one column per year.
cs_to_check = np.power(10, np.arange(-4, 2, 0.1))
years_to_check = range(2011, 2017)
c_effect_df_dict = { 'C' : cs_to_check }
for yr in years_to_check:
    (regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm) = get_features(
        yr, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog
    )
    # features are built once per year; only the model fit depends on C
    log_losses = []
    for C in cs_to_check:
        log_losses.append(make_predictions(yr, train_norm, test_norm, tourney, C = C)[2])
    c_effect_df_dict[str(yr)] = log_losses
c_effect = pd.DataFrame(c_effect_df_dict)



In [24]:

    
# Log loss as a function of C, one line per year.
plt.semilogx()
for col in [ col for col in c_effect if col != 'C' ]:
    # label each line with its year so that the legend has entries to show
    plt.plot(c_effect['C'], c_effect[col], label = col)
plt.legend(loc = 3)
plt.xlabel('C')
plt.ylabel('logloss')
plt.ylim(0.45, 0.65)
plt.show()

Look at who is contributing to logloss



In [25]:

    
# contribution to logloss: each game's share of the mean log loss, joined to its features
rc = res.copy()
ftc = feat_test.copy()
ftc['Id'] = get_key(ftc)
rc['logloss_contrib'] = -np.log(np.where(rc['Result'] == 1, rc['Pred'], 1 - rc['Pred'])) / len(rc)
ftc = pd.merge(rc, ftc, how = 'left', on = 'Id')

fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (10, 4))
# colormap passed by name; plt.cm.get_cmap is not available in recent matplotlib
im = axes[0].scatter(ftc['t_score'], ftc['t_rtg'], c = ftc['logloss_contrib'], vmin = 0, vmax = 0.025, cmap = 'coolwarm')
axes[0].set_xlabel('t_score')
axes[0].set_ylabel('t_rtg')
axes[1].scatter(-ftc['ln_dist_diff'], ftc['t_rtg'], c = ftc['logloss_contrib'], vmin = 0, vmax = 0.025, cmap = 'coolwarm')
# the plotted quantity is the negated feature, so the label says so
axes[1].set_xlabel('-ln_dist_diff')
cb = fig.colorbar(im, ax=axes.ravel().tolist(), label = 'logloss_contrib')
plt.show()

Logloss contribution by round



In [26]:

    
# Round number is read from the second character of the slot name (e.g. 'R5WX' -> 5).
tourney_rounds = tourney_raw[['Team1', 'Team2', 'Season', 'SlotMatchup']].copy()
tourney_rounds = tourney_rounds.assign(
    Id = get_key(tourney_rounds),
    round = tourney_rounds['SlotMatchup'].map(lambda slot: int(slot[1])),
)[['Id', 'round']]
ftc_with_rounds = ftc.merge(tourney_rounds, how = 'left', on = 'Id')

# left panel: default estimator per round; right panel: worst single game per round
fig, axs = plt.subplots(ncols=2, figsize = (10, 4))
for ax, extra_kwargs in zip(axs, ({}, {'estimator' : max})):
    sns.barplot(data = ftc_with_rounds, x = 'round', y = 'logloss_contrib', errwidth = 0, ax = ax, **extra_kwargs)
    ax.set_ylim(0, 0.035)
plt.show()

Overtime counts



In [398]:

    
# Bar per season (seasons after 2000) of the 'Numot' column; error bars are hidden.
sns.barplot(data = reg_season[reg_season['Season'] > 2000], x = 'Season', y = 'Numot', errwidth = 0)
plt.show()

A look at dynamics of ratings data



In [321]:

    
# ratings_eos only exists inside get_features, so build it here to keep the cell runnable
# on a fresh kernel. x / y are passed by keyword, as current seaborn requires.
ratings_eos = get_eos_ratings(ratings)
sns.lmplot(x = 'mean_rtg', y = 'std_rtg', data = ratings_eos, fit_reg = False)
plt.show()



In [353]:

    
# Compare std_rtg with a parabola in mean_rtg that vanishes at 0 and at the maximum mean rank.
# The end-of-season ratings are rebuilt here because no notebook-level ratings_eos is defined.
ratings_eos_test = get_eos_ratings(ratings)
half_max_rtg = ratings_eos_test['mean_rtg'].max()/2
ratings_eos_test['parabola_mean_model'] = half_max_rtg**2 - (ratings_eos_test['mean_rtg'] - half_max_rtg)**2
sns.lmplot(x = 'parabola_mean_model', y = 'std_rtg', data = ratings_eos_test, fit_reg = False)
plt.show()



In [352]:

    
# NOTE(review): `test_data` is not defined anywhere in this notebook. tourney_raw is used
# instead because it carries the mean_rtg_* / std_rtg_* columns needed here -- confirm
# that this is the frame originally intended.
test_data_test = tourney_raw.copy()
test_data_test['rtg_diff'] = test_data_test['mean_rtg_1'] - test_data_test['mean_rtg_2']
test_data_test['t_model'] = test_data_test['rtg_diff']/(test_data_test['std_rtg_1']**2 + test_data_test['std_rtg_2']**2)**0.5
sns.pairplot(test_data_test[['rtg_diff', 't_model']])
plt.show()

Quick investigation: looks like avg score decreases with log of distance traveled



In [247]:

    
# `get_training_data` does not exist in this notebook; reg_distance_to_game yields the
# w_dist / l_dist columns (next to Wscore / Lscore) that this cell needs.
dist_test = reg_distance_to_game(reg_season[reg_season['Season'] == 2016], team_geog)
# one (dist, score) observation per team per game
w_dist_test = dist_test[['w_dist', 'Wscore']].rename(columns = {'w_dist' : 'dist', 'Wscore' : 'score'})
l_dist_test = dist_test[['l_dist', 'Lscore']].rename(columns = {'l_dist' : 'dist', 'Lscore' : 'score'})
dist_test = pd.concat([w_dist_test, l_dist_test]).reset_index()[['dist', 'score']]



In [270]:

    
# Histogram of travel distance, x limited to 0-3000 with a log-scaled count axis.
plt.hist(dist_test['dist'])
plt.xlim(0, 3000)
plt.semilogy()
plt.show()



In [324]:

    
# Bucket games by floor(log(1 + distance)) and summarise the score per bucket.
bucket_size = 1
dist_test['bucket'] = bucket_size * (np.log(dist_test['dist'] + 1) // bucket_size)
# Aggregate the score column only, with string aggregation names, instead of passing
# numpy callables over every column. Output columns stay 'mean', 'std' and 'len'.
dist_grp = dist_test.groupby('bucket')['score'].agg(mean = 'mean', std = 'std', len = 'size')
# standard error of the mean score in each bucket
dist_grp['err'] = dist_grp['std'] / np.sqrt(dist_grp['len'])



In [325]:

    
# Mean score per log-distance bucket, with a shaded band of +/- 2 standard errors.
plt.plot(dist_grp['mean'])
plt.fill_between(dist_grp.index, 
                 (dist_grp['mean'] - 2*dist_grp['err']).values, 
                 (dist_grp['mean'] + 2*dist_grp['err']).values,
                 alpha = 0.3)
plt.xlabel('log of distance traveled')
plt.ylabel('avg score')
plt.show()

	Id	Result	Pred
0	2013_1195_1207	1	0.132318
1	2013_1217_1307	1	0.155442
2	2013_1195_1361	1	0.232046
3	2013_1211_1455	0	0.717839
4	2013_1243_1247	0	0.694749
5	2013_1326_1455	0	0.655845
6	2013_1332_1387	1	0.360817
7	2013_1247_1279	1	0.385783
8	2013_1279_1458	1	0.387495
9	2013_1301_1396	0	0.611246
10	2013_1266_1274	1	0.407874
11	2013_1231_1393	0	0.591224
12	2013_1329_1332	0	0.583771
13	2013_1338_1455	0	0.576519
14	2013_1143_1424	1	0.444394
15	2013_1235_1323	1	0.447536
16	2013_1196_1276	0	0.527638
17	2013_1161_1281	1	0.475924
18	2013_1272_1388	1	0.503020
19	2013_1242_1276	0	0.481214
20	2013_1160_1228	0	0.477644
21	2013_1276_1393	1	0.548947
22	2013_1139_1266	0	0.448682
23	2013_1112_1326	0	0.442424
24	2013_1181_1277	1	0.567102
25	2013_1328_1361	0	0.424199
26	2013_1278_1417	1	0.592172
27	2013_1181_1257	0	0.389304
28	2013_1137_1139	0	0.388393
29	2013_1314_1437	1	0.612684
...	...	...	...
33	2013_1257_1276	1	0.625686
34	2013_1276_1433	1	0.627119
35	2013_1112_1125	1	0.652192
36	2013_1242_1314	1	0.655322
37	2013_1247_1455	0	0.342686
38	2013_1166_1181	0	0.342179
39	2013_1143_1393	0	0.339969
40	2013_1172_1266	0	0.331516
41	2013_1103_1433	0	0.323716
42	2013_1228_1274	0	0.321120
43	2013_1196_1278	1	0.687194
44	2013_1235_1326	0	0.260738
45	2013_1161_1257	0	0.236103
46	2013_1277_1434	1	0.777846
47	2013_1257_1455	1	0.783571
48	2013_1257_1332	1	0.788143
49	2013_1231_1396	1	0.817128
50	2013_1308_1387	0	0.175497
51	2013_1112_1217	1	0.841982
52	2013_1276_1355	1	0.846034
53	2013_1285_1393	0	0.151820
54	2013_1233_1326	0	0.116557
55	2013_1274_1334	1	0.884251
56	2013_1195_1196	0	0.099069
57	2013_1196_1322	1	0.905434
58	2013_1107_1181	0	0.091412
59	2013_1211_1380	1	0.932522
60	2013_1242_1443	1	0.941689
61	2013_1231_1241	1	0.958962
62	2013_1257_1299	1	0.982159