In [1]:

    
import pandas as pd
import numpy as np
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt
from geopy.distance import great_circle as geodist
import sklearn.linear_model as sklm
import sklearn.preprocessing as skpp
import sklearn.metrics as skm
import sklearn.model_selection as skms



In [2]:

    
# load data

reg_season = pd.read_csv('data/RegularSeasonDetailedResults.csv')
tourney = pd.read_csv('data/TourneyDetailedResults.csv')
team_geog = pd.read_csv('data/addl/TeamGeog.csv')
tourney_geog = pd.read_csv('data/addl/TourneyGeog.csv')
tourney_slots = pd.read_csv('data/TourneySlots.csv')
tourney_seeds = pd.read_csv('data/TourneySeeds.csv')
kenpom = pd.read_csv('data/kenPomTeamData.csv')
teams = pd.read_csv('data/Teams.csv')
spreads = pd.read_csv('data/addl/point_spreads.csv')

ratings_pre2017 = pd.read_csv('data/addl/massey_ordinals_2003-2016.csv')
ratings_2017 = pd.read_csv('data/addl/MasseyOrdinals_2017_ThruDay133_68systems.csv')
ratings = pd.concat([ratings_pre2017, ratings_2017])

Almond Nut Learner

Use published rankings together with distance traveled to play to classify winners + losers

Train to regular season and test on post season

TWEAKS TO TRY

Combine preseason w postseason
Ranking: stretch out when top -- i.e. 1 to 2 should be similar to 5 to 20 type thing
Zero mean etc
Change C for first round vs subsequent rounds

considerations:

Refine
- Vegas odds in first round
- PREDICTING UPSETS??
  - team upset rating
  - team score variance
  - upset predictors based on past seasons
- Ratings closer to date played
- Model tuning / hyperparameter tuning
Implemented
- individual ratings vs aggregate
  - Look at aggregate and derive statistics
- diff vs absolute ratings
  - Use diffs for feature generation
- only use final rankings instead of those at time of play?
  - For now: time of play
- Distance from home? Distance from last game?
  - For now: distance from home
- How do regular season and playoffs differ in features?
  - Is using distance in playoffs trained on regular season right?
Augment (not yet executed)
- Defensive / offense ratings from kenpom
- Elo, Elo differences, and assoc probabilities
- Ensemble?
  - Construct micro-classifier from elo
- Coaches
- Look at momentum + OT effects when training
- Beginning of season vs end of season for training



In [3]:

    
def normalize_ratings(some_rater):
    cache = {}
    
    def get_eff_rank(rank, max_rank):
        if (rank, max_rank) not in cache:
            percentile = 1 - int(rank) / (max_rank + 1)
            cache[(rank, max_rank)] = ss.norm.ppf(percentile)
        return cache[(rank, max_rank)]
    
    sr = some_rater.copy()
    num_per_season = sr.groupby('season').max()['orank'].to_dict()
    sr['eff_rtg'] = sr.apply(lambda x: get_eff_rank(x['orank'], num_per_season[x['season']]), axis = 1)
    return sr



In [4]:

    
def attach_ratings_diff_stats(df, ratings_eos, season):
    out_cols = list(df.columns) + ['mean_rtg_1', 'std_rtg_1', 'num_rtg_1', 'mean_rtg_2', 'std_rtg_2', 'num_rtg_2']
    rtg_1 = ratings_eos.rename(columns = {'mean_rtg' : 'mean_rtg_1', 'std_rtg' : 'std_rtg_1', 'num_rtg' : 'num_rtg_1'})
    rtg_2 = ratings_eos.rename(columns = {'mean_rtg' : 'mean_rtg_2', 'std_rtg' : 'std_rtg_2', 'num_rtg' : 'num_rtg_2'})
    return df\
        .merge(rtg_1, left_on = ['Season', 'Team1'], right_on = ['season', 'team'])\
        .merge(rtg_2, left_on = ['Season', 'Team2'], right_on = ['season', 'team'])\
        [out_cols]

def get_eos_ratings(ratings):
    ratings_last_day = ratings.groupby('season').aggregate(max)[['rating_day_num']].reset_index()
    ratings_eos_all = ratings_last_day\
        .merge(ratings, left_on = ['season', 'rating_day_num'], right_on = ['season', 'rating_day_num'])
    ratings_eos_all = normalize_ratings(ratings_eos_all)
    ratings_eos = ratings_eos_all.groupby(['season', 'team']).aggregate([np.mean, np.std, len])['eff_rtg']
    return ratings_eos.reset_index().rename(columns = {'mean' : 'mean_rtg', 'std' : 'std_rtg', 'len' : 'num_rtg'})



In [5]:

    
def attach_ratings_by_day_diff_stats(df, ratings_by_day, season):
    out_cols = list(df.columns) + ['mean_rtg_1', 'std_rtg_1', 'num_rtg_1', 'mean_rtg_2', 'std_rtg_2', 'num_rtg_2']
    rtg_1 = ratings_by_day.rename(columns = {'mean_rtg' : 'mean_rtg_1', 'std_rtg' : 'std_rtg_1', 'num_rtg' : 'num_rtg_1'})
    rtg_2 = ratings_by_day.rename(columns = {'mean_rtg' : 'mean_rtg_2', 'std_rtg' : 'std_rtg_2', 'num_rtg' : 'num_rtg_2'})
    return df\
        .merge(rtg_1, left_on = ['Season', 'Daynum', 'Team1'], right_on = ['season', 'rating_day_num', 'team'])\
        .merge(rtg_2, left_on = ['Season', 'Daynum', 'Team2'], right_on = ['season', 'rating_day_num', 'team'])\
        [out_cols]

def get_ratings_by_day(ratings):
    # find closest rating, drag forward (i.e. most recent rating) -- then drag up
    rc = ratings.copy()
    rc = normalize_ratings(rc)
    ratings_by_day_stats = rc.groupby(['season', 'rating_day_num', 'team'])\
        .aggregate([np.mean, np.std, len])['eff_rtg']\
        .rename(columns = {'mean' : 'mean_rtg', 'std' : 'std_rtg', 'len' : 'num_rtg'})\
        .reset_index()
    all_teams = rc['team'].unique()
    all_day_teams = pd.DataFrame(pd.DataFrame(
        [ (year, day, team) for year in range(2000, rc['season'].max()+1)\
             for day in range(200)\
             for team in all_teams
        ])
    ).rename(columns = {0 : 'season', 1 : 'rating_day_num', 2 : 'team'})
    ratings_by_day_holes = pd.merge(all_day_teams, ratings_by_day_stats,
                                    on = ['season', 'rating_day_num', 'team'], 
                                    how = 'left')
    ratings_by_day = ratings_by_day_holes.sort_values(['season', 'rating_day_num'])\
        .groupby(['season', 'team']).ffill()\
        .groupby(['season', 'team']).bfill()
    return ratings_by_day



In [6]:

    
def get_score_fluctuation(reg_season, season):
    # note: quick and dirty; not best practice for home / away etc b/c these would only improve est for
    # std on second order
    # scale the score spreads by # posessions
    # note: units don't really matter because this is used in a ratio and is normalized later
    
    rsc = reg_season[reg_season['Season'] == season].copy()
        
    # avg home vs away
    hscores = rsc[rsc['Wloc'] == 'H']['Wscore'].tolist() + rsc[rsc['Wloc'] == 'A']['Lscore'].tolist()
    ascores = rsc[rsc['Wloc'] == 'A']['Wscore'].tolist() + rsc[rsc['Wloc'] == 'H']['Lscore'].tolist()
    home_correction = np.mean(hscores) - np.mean(ascores)
    
    # get posessions per game
    posessions = 0.5 * (
        rsc['Lfga'] - rsc['Lor'] + rsc['Lto'] + 0.475*rsc['Lfta'] +\
        rsc['Wfga'] - rsc['Wor'] + rsc['Wto'] + 0.475*rsc['Wfta']
    )
    
    # get victory margins and correct for home / away -- scale for posessions
    rsc['win_mgn'] = rsc['Wscore'] - rsc['Lscore']
    rsc['win_mgn'] += np.where(rsc['Wloc'] == 'H', -home_correction, 0)
    rsc['win_mgn'] += np.where(rsc['Wloc'] == 'A',  home_correction, 0)
    rsc['win_mgn_scaled'] = rsc['win_mgn'] * 100 / posessions # score per 100 posessions
    
    # get mgn of victory stats per team
    win_mgns_wins = rsc[['Wteam', 'win_mgn_scaled']].rename(columns = {'Wteam' : 'team', 'win_mgn_scaled' : 'mgn'})
    win_mgns_losses = rsc[['Lteam', 'win_mgn_scaled']].rename(columns = {'Lteam' : 'team', 'win_mgn_scaled' : 'mgn'})
    win_mgns_losses['mgn'] *= -1
    win_mgns = pd.concat([win_mgns_wins, win_mgns_losses])
    
    return win_mgns.groupby('team').aggregate(np.std).rename(columns = {'mgn' : 'std_mgn'}).reset_index()

def attach_score_fluctuations(df, reg_season, season):
    cols_to_keep = list(df.columns) + ['std_mgn_1', 'std_mgn_2']
    
    fluct = get_score_fluctuation(reg_season, season)
    fluct1 = fluct.rename(columns = {'std_mgn' : 'std_mgn_1'})
    fluct2 = fluct.rename(columns = {'std_mgn' : 'std_mgn_2'})
    return df\
        .merge(fluct1, left_on = 'Team1', right_on = 'team')\
        .merge(fluct2, left_on = 'Team2', right_on = 'team')[cols_to_keep]



In [7]:

    
def attach_kenpom_stats(df, kenpom, season):
    cols_to_keep = list(df.columns) + ['adjem_1', 'adjem_2', 'adjt_1', 'adjt_2']
    
    kp = kenpom.copy()
    kp['AdjEM'] = kp['AdjOE'] - kp['AdjDE']
    
    kp1 = kp[kp['Season'] == season][['Team_Id', 'AdjEM', 'AdjTempo']]\
        .rename(columns = {'AdjEM' : 'adjem_1', 'AdjTempo' : 'adjt_1'})
    kp2 = kp[kp['Season'] == season][['Team_Id', 'AdjEM', 'AdjTempo']]\
        .rename(columns = {'AdjEM' : 'adjem_2', 'AdjTempo' : 'adjt_2'})
    return df\
        .merge(kp1, left_on = 'Team1', right_on = 'Team_Id')\
        .merge(kp2, left_on = 'Team2', right_on = 'Team_Id')[cols_to_keep]



In [8]:

    
def get_root_and_leaves(hierarchy):
    all_children = set(hierarchy[['Strongseed', 'Weakseed']].values.flatten())
    all_parents = set(hierarchy[['Slot']].values.flatten())
    root = [ p for p in all_parents if p not in all_children ][0]
    leaves = [ c for c in all_children if c not in all_parents ]
    return root, leaves

def get_tourney_tree_one_season(tourney_slots, season):
    
    def calculate_depths(tree, child, root):
        if child == root:
            return 0
        elif tree[child]['depth'] < 0:
            tree[child]['depth'] = 1 + calculate_depths(tree, tree[child]['parent'], root)
        return tree[child]['depth']
        
    hierarchy = tourney_slots[tourney_slots['Season'] == season][['Slot', 'Strongseed', 'Weakseed']]
    root, leaves = get_root_and_leaves(hierarchy) # should be R6CH...
    tree_raw = {**dict(zip(hierarchy['Strongseed'],hierarchy['Slot'])), 
                **dict(zip(hierarchy['Weakseed'],hierarchy['Slot']))}
    tree = { c : {'parent' : tree_raw[c], 'depth' : -1} for c in tree_raw}
    
    for c in leaves:
        calculate_depths(tree, c, root)
    
    return tree

def get_tourney_trees(tourney_slots):
    return { season : get_tourney_tree_one_season(tourney_slots, season)\
        for season in tourney_slots['Season'].unique() }

def slot_matchup_from_seed(tree, seed1, seed2):
    # return which slot the two teams would face off in
    if seed1 == seed2:
        return seed1
    next_seed1 = seed1 if tree[seed1]['depth'] < tree[seed2]['depth'] else tree[seed1]['parent']
    next_seed2 = seed2 if tree[seed2]['depth'] < tree[seed1]['depth'] else tree[seed2]['parent']
    return slot_matchup_from_seed(tree, next_seed1, next_seed2)

def get_team_seed(tourney_seeds, season, team):
    seed = tourney_seeds[
        (tourney_seeds['Team'] == team) & 
        (tourney_seeds['Season'] == season)
    ]['Seed'].values
    if len(seed) == 1:
        return seed[0]
    else:
        return None



In [9]:

    
def dist(play_lat, play_lng, lat, lng):
    return geodist((play_lat, play_lng), (lat, lng)).miles

def reg_distance_to_game(games_in, team_geog):
    
    games = games_in.copy()
    out_cols = list(games.columns) + ['w_dist', 'l_dist']
    
    w_geog = team_geog.rename(columns = {'lat' : 'w_lat', 'lng' : 'w_lng'})
    l_geog = team_geog.rename(columns = {'lat' : 'l_lat', 'lng' : 'l_lng'})
    games = games\
        .merge(w_geog, left_on = 'Wteam', right_on = 'team_id')\
        .merge(l_geog, left_on = 'Lteam', right_on = 'team_id')
    # handle neutral locations later by averaging distance from home for 2 teams if neutral location
    games['play_lat'] = np.where(games['Wloc'] == 'H', games['w_lat'], games['l_lat'])
    games['play_lng'] = np.where(games['Wloc'] == 'H', games['w_lng'], games['l_lng'])
    games['w_dist'] = games.apply(lambda x: dist(x['play_lat'], x['play_lng'], x['w_lat'], x['w_lng']), axis = 1)
    games['l_dist'] = games.apply(lambda x: dist(x['play_lat'], x['play_lng'], x['l_lat'], x['l_lng']), axis = 1)
    # correct for neutral
    games['w_dist'],  games['l_dist']  =\
        np.where(games['Wloc'] == 'N', (games['w_dist'] + games['l_dist'])/2, games['w_dist']),\
        np.where(games['Wloc'] == 'N', (games['w_dist'] + games['l_dist'])/2, games['l_dist'])
    return games[out_cols]

def tourney_distance_to_game(tourney_raw_in, tourney_geog, team_geog, season):
   
    out_cols = list(tourney_raw_in.columns) + ['dist_1', 'dist_2']

    tourney_raw = tourney_raw_in.copy()
    
    geog_1 = team_geog.rename(columns = {'lat' : 'lat_1', 'lng' : 'lng_1'})
    geog_2 = team_geog.rename(columns = {'lat' : 'lat_2', 'lng' : 'lng_2'})
    geog_play = tourney_geog[tourney_geog['season'] == season][['slot', 'lat', 'lng']]\
        .rename(columns = {'lat' : 'lat_p', 'lng' : 'lng_p'})
    
    tourney_raw = tourney_raw\
        .merge(geog_1, left_on = 'Team1', right_on = 'team_id')\
        .merge(geog_2, left_on = 'Team2', right_on = 'team_id')\
        .merge(geog_play, left_on = 'SlotMatchup', right_on = 'slot')
   
    tourney_raw['dist_1'] = tourney_raw.apply(lambda x: dist(x['lat_p'], x['lng_p'], x['lat_1'], x['lng_1']), axis = 1)
    tourney_raw['dist_2'] = tourney_raw.apply(lambda x: dist(x['lat_p'], x['lng_p'], x['lat_2'], x['lng_2']), axis = 1)
    
    return tourney_raw[out_cols]



In [10]:

    
def attach_elom(regular_raw, tourney_raw, n_seed = 3, tau_seed = 70, tau_feature = 35):
    # A bit of a beast and I might want to break this thing up
    # 1. Find regular season ELO momentum
    #    * seed by running on long history
    #    * run for one season with short history (tau_feature)
    # 2. Find in tourney
    #    * assume opponent is best that you would face
    #    * update elos between rounds to prevent unintended feedback loop
    #    * need to break out round x team then rejoin on team x team
    #    * if team at hand is strong seed, use second strongest
    
    # FIRST DO REGULAR SEASON
    
    rrc = regular_raw.sort_values(['Season', 'Daynum'])

    elo_map = { team : 0 for team in set(rrc['Team1']).union(set(rrc['Team2'])) }

    hscores = rrc[rrc['dist_1'] < rrc['dist_2']]['score_1'].tolist() + rrc[rrc['dist_1'] > rrc['dist_2']]['score_2'].tolist()
    ascores = rrc[rrc['dist_1'] > rrc['dist_2']]['score_1'].tolist() + rrc[rrc['dist_1'] < rrc['dist_2']]['score_2'].tolist()
    correction = { -1 : -(np.mean(hscores) - np.mean(ascores)), 0 : 0, 1 : np.mean(hscores) - np.mean(ascores) }

    rrc['eff_score_diff'] = (rrc['score_1'] - rrc['score_2']) +\
        (rrc['dist_1'] - rrc['dist_2']).map(lambda x: correction[int(np.sign(x))])

    # decay time: how much history to consider?
    # this can in theory be reduced during the tournament to emphasize recent performance
    taus = [ tau_seed ] * n_seed + [ tau_feature ]
    is_for_feature = [ False ] * n_seed + [ True ]

    # seed the season -- run it a few times -- should over time bake in strength of schedule and converge 
    elos_1, elos_2 = [], []
    for tau, is_feature in zip(taus, is_for_feature):
        for team1, team2, effdiff in rrc[['Team1', 'Team2', 'eff_score_diff']].values:
            elo_1, elo_2 = elo_map[team1], elo_map[team2]
            # store beginning of game status if it's time (i.e. done with elo seed)
            if is_feature:
                elos_1 += [ elo_1 ]
                elos_2 += [ elo_2 ]
            # update elos
            source_elo_change = np.sign(effdiff) * np.log(1 + abs(effdiff)) 
            elo_map[team1] = (elo_2 + source_elo_change) / tau + (1 - 1/tau) * elo_1
            elo_map[team2] = (elo_1 - source_elo_change) / tau + (1 - 1/tau) * elo_2


    rrc['elom_1'] = elos_1
    rrc['elom_2'] = elos_2
    
    # NOW DO TOURNEY

    # get top seeds per round (primary + secondary)
    # join on all possible teams playing at each round
    # sort by rounds and play it out against expected opponents to get elo estimates
    # rejoin estimates on tourney_raw

    avg_win_pts = rrc['eff_score_diff'].map(abs).mean() # some estimate of win mgn -- should this be humbled?

    teams_by_round = pd.concat([
        tourney_raw[['SlotMatchup', 'Team1', 'mean_rtg_1']].rename(columns = {'Team1':'Team_Id', 'mean_rtg_1':'mean_rtg'}),
        tourney_raw[['SlotMatchup', 'Team2', 'mean_rtg_2']].rename(columns = {'Team2':'Team_Id', 'mean_rtg_2':'mean_rtg'}),
    ])\
        .sort_values(['SlotMatchup', 'mean_rtg'])\
        .groupby(['SlotMatchup', 'Team_Id']).head(1).reset_index(drop = True)

    top_seeds_per_round = teams_by_round.groupby('SlotMatchup').head(2).reset_index(drop = True)
    top_seeds_per_round['Is_Primary'] = top_seeds_per_round.index % 2 == 0
    top_seeds_per_round = \
        top_seeds_per_round.pivot(index = 'SlotMatchup', columns = 'Is_Primary', values = 'Team_Id').reset_index()
    top_seeds_per_round = top_seeds_per_round.rename(columns = {False : 'Secondary', True : 'Primary'})
    top_seeds_per_round.columns.name = None
    top_seeds_per_round['Round'] = top_seeds_per_round['SlotMatchup']\
        .map(lambda x: 0 if len(x) != 4 else x[1]).map(int)

    momentum_elo_estimates = teams_by_round.join(top_seeds_per_round.set_index('SlotMatchup'), on = 'SlotMatchup')
    momentum_elo_estimates = momentum_elo_estimates.sort_values(['Round', 'SlotMatchup', 'Team_Id']).reset_index(drop = True)

    # sim the elo's but each round work on a temp copy to prevent bleed between iterations within round
    elos = []
    last_round = -1
    elo_map_updated = elo_map.copy()
    for team, primary, secondary, this_round in momentum_elo_estimates[['Team_Id', 'Secondary', 'Primary', 'Round']].values:
        if last_round != this_round:
            elo_map = elo_map_updated.copy()
        opponent = primary if team != primary else secondary
        elo, opp_elo = elo_map[team], elo_map[opponent]
        elos += [elo]
        source_elo_change = np.log(1 + avg_win_pts)
        #elo_map_updated[team] = (opp_elo + source_elo_change) / tau_feature + (1 - 1/tau_feature) * elo
        elo_map_updated[team] = max(opp_elo, elo)

    momentum_elo_estimates['elom'] = elos

    # attach to tourney_raw
    elom_1 = momentum_elo_estimates[['SlotMatchup', 'Team_Id', 'elom']]\
        .rename(columns = {'Team_Id' : 'Team1', 'elom' : 'elom_1'})
    elom_2 = momentum_elo_estimates[['SlotMatchup', 'Team_Id', 'elom']]\
        .rename(columns = {'Team_Id' : 'Team2', 'elom' : 'elom_2'})
    tourney_raw_with_elom = tourney_raw\
        .join(elom_1.set_index(['SlotMatchup', 'Team1']), on = ['SlotMatchup', 'Team1'])\
        .join(elom_2.set_index(['SlotMatchup', 'Team2']), on = ['SlotMatchup', 'Team2'])
        
    # OUTPUT
    
    return rrc.drop('eff_score_diff', 1), tourney_raw_with_elom



In [11]:

    
def get_raw_reg_season_data(reg_season, team_geog, season):
    
    cols_to_keep = ['Season', 'Daynum', 'Team1', 'Team2', 'score_1', 'score_2', 'posessions', 'dist_1', 'dist_2']
    
    rsr = reg_season[reg_season['Season'] == season] # reg season raw
    rsr = reg_distance_to_game(rsr, team_geog)
    
    rsr['Team1'] = np.where(rsr['Wteam'] < rsr['Lteam'], rsr['Wteam'], rsr['Lteam'])
    rsr['Team2'] = np.where(rsr['Wteam'] > rsr['Lteam'], rsr['Wteam'], rsr['Lteam'])
    rsr['score_1'] = np.where(rsr['Wteam'] < rsr['Lteam'], rsr['Wscore'], rsr['Lscore'])
    rsr['score_2'] = np.where(rsr['Wteam'] > rsr['Lteam'], rsr['Wscore'], rsr['Lscore'])
    rsr['dist_1'] = np.where(rsr['Wteam'] < rsr['Lteam'], rsr['w_dist'], rsr['l_dist'])
    rsr['dist_2'] = np.where(rsr['Wteam'] > rsr['Lteam'], rsr['w_dist'], rsr['l_dist'])
    # From kenpom: FGA-OR+TO+0.475xFTA
    rsr['posessions'] = 0.5 * (
        rsr['Wfga'] - rsr['Wor'] + rsr['Wto'] + 0.475*rsr['Wfta'] +\
        rsr['Lfga'] - rsr['Lor'] + rsr['Lto'] + 0.475*rsr['Lfta']
    )
    
    return rsr[cols_to_keep]

def get_raw_tourney_data(tourney_seeds, tourney_trees, tourney_geog, team_geog, season):
    
    # tree to find play location
    tree = tourney_trees[season]
    
    # get all teams in tourney
    seed_map = tourney_seeds[tourney_seeds['Season'] == season].set_index('Team').to_dict()['Seed']
    teams = sorted(seed_map.keys())
    
    team_pairs = sorted([ (team1, team2) for team1 in teams for team2 in teams if team1 < team2 ])
    tourney_raw = pd.DataFrame(team_pairs).rename(columns = { 0 : 'Team1', 1 : 'Team2' })
    tourney_raw['Season'] = season
    
    # find out where they would play each other
    tourney_raw['SlotMatchup'] = tourney_raw.apply(
        lambda x: slot_matchup_from_seed(tree, seed_map[x['Team1']], seed_map[x['Team2']]), axis = 1
    )
    
    # get features
    tourney_raw = tourney_distance_to_game(tourney_raw, tourney_geog, team_geog, season)
    
    return tourney_raw

def attach_supplements(data, reg_season, kenpom, ratings_eos, ratings_by_day, season, use_eos_rtg = False):
    
    dc = data.copy()
    if use_eos_rtg: # get rating diff stats
        dc = attach_ratings_diff_stats(dc, ratings_eos, season) 
    else:
        dc = attach_ratings_by_day_diff_stats(dc, ratings_by_day, season)
    dc = attach_kenpom_stats(dc, kenpom, season)
    dc = attach_score_fluctuations(dc, reg_season, season)
    
    return dc

Feature engineering

Log of distance
Capture rating diffs
Capture rating diffs acct for variance (t score)
Diff in expected scores via EM diffs

Tag winners in training set + viz. Also, normalize data.



In [12]:

    
def generate_features(df):
    
    has_score = 'score_1' in df.columns and 'score_2' in df.columns
    
    cols_to_keep = ['Team1', 'Team2', 'Season', 'ln_dist_diff',
                    'rtg_diff', 't_rtg', 'pt_diff', 't_score', 
                    'elom_diff'] +\
        (['Team1_win'] if has_score else [])
    
    features = df.copy()
    # use negative so that shorter travel distance has higher statistic
    features['ln_dist_diff'] = -np.log((1 + df['dist_1'])/(1 + df['dist_2']))
    features['rtg_diff'] = (df['mean_rtg_1'] - df['mean_rtg_2']) 
    features['t_rtg'] =  features['rtg_diff'] / np.sqrt(df['std_rtg_1']**2 + df['std_rtg_2']**2)
    features['pt_diff'] = df['adjem_1'] - df['adjem_2']
    features['t_score'] = (df['adjem_1'] - df['adjem_2']) / np.sqrt(df['std_mgn_1']**2 + df['std_mgn_2']**2)
    # elom is supposed to be a humbler -- do nothing with big diff, do a lot (humble) with a small diff?
    features['elom_diff'] = df['elom_1'] - df['elom_2']
    
    # truth feature: did team 1 win?
    if has_score:
        features['Team1_win'] = features['score_1'] > features['score_2']
    
    return features[cols_to_keep]

def normalize_features(train, test, features):
    all_data_raw = pd.concat([train[features], test[features]])
    all_data_norm = skpp.scale(all_data_raw, with_mean = False) # with_mean = False ?
    train_norm = train.copy()
    test_norm = test.copy()
    train_norm[features] = all_data_norm[:len(train)]
    test_norm[features] = all_data_norm[len(train):]
    return train_norm, test_norm



In [13]:

    
def get_key(df):
    return df['Season'].map(str) + '_' + df['Team1'].map(str) + '_' + df['Team2'].map(str)

Running the model



In [14]:

    
features_to_use = ['ln_dist_diff', 'rtg_diff', 't_rtg', 'pt_diff', 't_score', 'elom_diff']
#features_to_use = ['ln_dist_diff', 'rtg_diff', 't_rtg', 'pt_diff', 't_score']
#features_to_use = ['ln_dist_diff', 'rtg_diff', 't_rtg', 't_score', 'elom_diff']
#features_to_use = ['ln_dist_diff', 't_rtg', 't_score', 'elom_diff']
#features_to_use = ['ln_dist_diff', 'pt_diff', 't_score',  'elom_diff']
#features_to_use = ['ln_dist_diff', 'rtg_diff', 't_rtg', 'elom_diff']
#features_to_use = ['elom_diff']
predict_field = 'Team1_win'



In [15]:

    
ratings_by_day = get_ratings_by_day(ratings)



In [ ]:

    
def get_features(season, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog):
    
    # support data
    tourney_trees = get_tourney_trees(tourney_slots)
    ratings_eos = get_eos_ratings(ratings)
    #ratings_by_day = get_ratings_by_day(ratings)
    
    # regular season cleaned data
    regular_raw = get_raw_reg_season_data(reg_season, team_geog, season)
    regular_raw =\
        attach_supplements(regular_raw, reg_season, kenpom, ratings_eos, ratings_by_day, season, use_eos_rtg = False)
    
    # post season cleaned data
    tourney_raw = get_raw_tourney_data(tourney_seeds, tourney_trees, tourney_geog, team_geog, season)
    tourney_raw =\
        attach_supplements(tourney_raw, reg_season, kenpom, ratings_eos, ratings_by_day, season, use_eos_rtg = True)
    
    # attach elo momentum
    # -- this guy is different from everything else because it uses reg season to inform tourney
    # -- therefore it should be treated appropriately and differently
    regular_raw, tourney_raw = attach_elom(regular_raw, tourney_raw)
    
    # get and normalize features
    feat_train = generate_features(regular_raw)
    feat_test = generate_features(tourney_raw)
    train_norm, test_norm = normalize_features(feat_train, feat_test, features_to_use)
    # playing with artificially making these coefficients higher (since lr overfits to score -- it == win)
    
    return regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm

def make_predictions(season, train_norm, test_norm, tourney, C1 = 1, C2 = 1):
    
    # fit
    lr1 = sklm.LogisticRegression(C = C1, fit_intercept = False) # fit_intercept = False?
    lr2 = sklm.LogisticRegression(C = C2, fit_intercept = False)
    lr1.fit(train_norm[features_to_use].values, train_norm[predict_field].values)
    lr2.fit(train_norm[features_to_use].values, train_norm[predict_field].values)

    # predictions
    tourney_round = tourney_raw['SlotMatchup'].map(lambda slot: 0 if len(slot) < 3 else int(slot[1])).values
    probs1 = lr1.predict_proba(test_norm[tourney_round <= 1][features_to_use].values) 
    probs2 = lr2.predict_proba(test_norm[tourney_round >  1][features_to_use].values) 
    keys1 = get_key(test_norm[tourney_round <= 1])
    keys2 = get_key(test_norm[tourney_round >  1])
    predictions_with_round_id = pd.concat([
        pd.DataFrame({'Id' : keys1.values, 'Pred' : probs1[:,1], 'first_round' : True}),
        pd.DataFrame({'Id' : keys2.values, 'Pred' : probs2[:,1], 'first_round' : False})
    ])
    predictions = predictions_with_round_id[['Id', 'Pred']]
    
    if season in tourney['Season'].values:
        # Evaluate outcomes
        res_base = tourney[(tourney['Season'] == season) & (tourney['Daynum'] > 135)].copy().reset_index()
        res_base['Team1'] = np.where(res_base['Wteam'] < res_base['Lteam'], res_base['Wteam'], res_base['Lteam'])
        res_base['Team2'] = np.where(res_base['Wteam'] > res_base['Lteam'], res_base['Wteam'], res_base['Lteam'])
        res_base['Result'] = (res_base['Wteam'] == res_base['Team1']).map(lambda x: 1 if x else 0)
        res_base['Id'] = get_key(res_base) 
        # attach results to predictions
        res = pd.merge(res_base[['Id', 'Result']], predictions_with_round_id, on = 'Id', how = 'left')
        res1 = res[res['first_round'] == True]
        res2 = res[res['first_round'] == False]
        # logloss
        ll  = skm.log_loss(res['Result'], res['Pred'])
        ll1 = skm.log_loss(res1['Result'], res1['Pred'])
        ll2 = skm.log_loss(res2['Result'], res2['Pred'])
    else:
        res = ll = ll1 = ll2 = None
    
    return predictions, res, [ll, ll1, ll2], [lr1, lr2]



In [ ]:

    
all_predictions = []
for season in [2014, 2015, 2016]:
    regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm = \
        get_features(season, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog)
    # see below for choice of C
    predictions, res, lls, lrs = make_predictions(season, train_norm, test_norm, tourney, C1 = 7e-3, C2 = 5e0)
    print(lls, lrs[0].coef_)
    all_predictions += [predictions]









    



[0.56886081537858457, 0.45381498119959052, 0.68761780549883633] [[ 0.42351311  0.05303248 -0.00191162  0.64102356  0.62500986  0.21059879]]



In [ ]:

    
# 0.587211080951
# 0.564406324468
# 0.493727883134
# 0.531440236192



In [18]:

    
pd.concat(all_predictions).to_csv('./submissions/logisticModel_regSeason_7e-3_5e0_dist_rtg_score_20170315.csv', index = False)



In [173]:

    
regular_raw[train_norm['t_rtg'] < -7]









    Out[173]:






  
    
      
      Season
      Daynum
      Team1
      Team2
      score_1
      score_2
      posessions
      dist_1
      dist_2
      mean_rtg_1
      ...
      std_rtg_2
      num_rtg_2
      adjem_1
      adjem_2
      adjt_1
      adjt_2
      std_mgn_1
      std_mgn_2
      elom_1
      elom_2
    
  
  
    
      1482
      2016
      11
      1299
      1345
      40
      81
      69.3750
      489.301506
      0.0
      333.896552
      ...
      4.360173
      62
      -18.9160
      24.6654
      68.2101
      68.2777
      18.815393
      21.857408
      -1.012956
      1.364967
    
    
      1285
      2016
      37
      1271
      1277
      35
      78
      66.0125
      557.961678
      0.0
      316.551724
      ...
      2.710859
      62
      -15.2109
      29.5834
      67.9338
      67.3384
      19.832705
      22.683973
      -1.324791
      2.234999
    
    
      1565
      2016
      50
      1175
      1428
      58
      105
      71.0500
      1915.896870
      0.0
      347.103448
      ...
      7.217553
      62
      -24.5365
      18.0909
      67.2308
      66.4873
      19.087075
      24.477933
      -2.222088
      1.363091
    
    
      4993
      2016
      51
      1148
      1163
      52
      99
      74.4000
      28.949304
      0.0
      349.637931
      ...
      6.818984
      59
      -26.7756
      18.3083
      68.3637
      66.6498
      16.391800
      21.708097
      -2.366199
      1.313501
    
  

4 rows × 23 columns



In [20]:

    
sns.pairplot(train_norm, hue = predict_field, vars = features_to_use)
plt.show()

Sandbox explorations



In [239]:

    
res.ix[np.argsort(-(res['Pred'] - res['Result']).abs())].reset_index(drop = True)









    Out[239]:






  
    
      
      Id
      Result
      Pred
    
  
  
    
      0
      2016_1277_1292
      0
      0.926133
    
    
      1
      2016_1393_1438
      1
      0.163649
    
    
      2
      2016_1143_1218
      0
      0.758979
    
    
      3
      2016_1114_1345
      1
      0.247886
    
    
      4
      2016_1372_1452
      1
      0.253782
    
    
      5
      2016_1458_1462
      1
      0.297508
    
    
      6
      2016_1320_1400
      1
      0.300942
    
    
      7
      2016_1242_1437
      0
      0.620614
    
    
      8
      2016_1231_1246
      1
      0.427953
    
    
      9
      2016_1124_1463
      0
      0.569694
    
    
      10
      2016_1211_1393
      0
      0.563849
    
    
      11
      2016_1112_1455
      0
      0.551972
    
    
      12
      2016_1211_1428
      1
      0.453506
    
    
      13
      2016_1314_1437
      0
      0.530796
    
    
      14
      2016_1173_1393
      0
      0.528484
    
    
      15
      2016_1153_1386
      0
      0.525397
    
    
      16
      2016_1328_1332
      1
      0.513272
    
    
      17
      2016_1338_1458
      0
      0.486382
    
    
      18
      2016_1211_1371
      1
      0.520524
    
    
      19
      2016_1323_1458
      1
      0.524347
    
    
      20
      2016_1344_1425
      1
      0.533233
    
    
      21
      2016_1323_1372
      1
      0.541952
    
    
      22
      2016_1328_1437
      0
      0.444530
    
    
      23
      2016_1276_1323
      0
      0.444343
    
    
      24
      2016_1139_1403
      1
      0.590411
    
    
      25
      2016_1274_1455
      1
      0.598120
    
    
      26
      2016_1160_1163
      0
      0.393382
    
    
      27
      2016_1181_1332
      0
      0.387228
    
    
      28
      2016_1328_1401
      1
      0.619735
    
    
      29
      2016_1333_1433
      0
      0.363056
    
    
      ...
      ...
      ...
      ...
    
    
      33
      2016_1292_1393
      0
      0.295919
    
    
      34
      2016_1231_1314
      0
      0.293655
    
    
      35
      2016_1235_1438
      0
      0.260924
    
    
      36
      2016_1234_1396
      1
      0.741763
    
    
      37
      2016_1332_1386
      1
      0.749091
    
    
      38
      2016_1218_1268
      0
      0.247900
    
    
      39
      2016_1268_1355
      1
      0.759495
    
    
      40
      2016_1234_1437
      0
      0.228492
    
    
      41
      2016_1151_1231
      0
      0.216956
    
    
      42
      2016_1181_1423
      1
      0.789615
    
    
      43
      2016_1201_1428
      0
      0.200167
    
    
      44
      2016_1320_1401
      0
      0.171157
    
    
      45
      2016_1233_1235
      0
      0.168415
    
    
      46
      2016_1328_1433
      1
      0.833722
    
    
      47
      2016_1314_1323
      1
      0.833816
    
    
      48
      2016_1246_1392
      1
      0.837882
    
    
      49
      2016_1139_1438
      0
      0.161033
    
    
      50
      2016_1314_1393
      1
      0.844950
    
    
      51
      2016_1242_1268
      1
      0.870685
    
    
      52
      2016_1314_1344
      1
      0.900806
    
    
      53
      2016_1163_1242
      0
      0.094444
    
    
      54
      2016_1451_1462
      0
      0.092781
    
    
      55
      2016_1138_1274
      0
      0.075859
    
    
      56
      2016_1401_1453
      1
      0.930645
    
    
      57
      2016_1167_1328
      0
      0.067718
    
    
      58
      2016_1421_1437
      0
      0.043569
    
    
      59
      2016_1214_1438
      0
      0.037007
    
    
      60
      2016_1221_1332
      0
      0.029815
    
    
      61
      2016_1195_1314
      0
      0.013139
    
    
      62
      2016_1122_1242
      0
      0.007319
    
  

63 rows × 3 columns



In [136]:

    
# accuracy?
np.sum(np.where(res['Pred'] > 0.5, res['Result'] == 1, res['Result'] == 0)) / len(res)









    Out[136]:





0.76190476190476186

Effect of C on different years



In [19]:

    
cs_to_check = np.power(10, np.arange(-4, 2 + 1e-9, 0.1))
years_to_check = range(2011, 2017)
c_effect_df_dict = { 'C' : cs_to_check }
c_effect_df_dict_first_round = c_effect_df_dict.copy()
c_effect_df_dict_later_round = c_effect_df_dict.copy()
for yr in years_to_check:
    regular_raw, tourney_raw, feat_train, feat_test, train_norm, test_norm = \
        get_features(yr, tourney_slots, ratings, reg_season, team_geog, kenpom, tourney_seeds, tourney_geog)
    log_losses_components = [ make_predictions(yr, train_norm, test_norm, tourney, C1 = C, C2 = C)[2]\
                                  for C in cs_to_check ]
    log_losses = [ ll[0] for ll in log_losses_components ]
    log_losses_first_round = [ ll[1] for ll in log_losses_components ]
    log_losses_later_round = [ ll[2] for ll in log_losses_components ]
    c_effect_df_dict[str(yr)] = log_losses
    c_effect_df_dict_first_round[str(yr)] = log_losses_first_round
    c_effect_df_dict_later_round[str(yr)] = log_losses_later_round
c_effect = pd.DataFrame(c_effect_df_dict)
c_effect_first_round = pd.DataFrame(c_effect_df_dict_first_round)
c_effect_later_round = pd.DataFrame(c_effect_df_dict_later_round)
c_effect_all = c_effect\
    .join(c_effect_first_round.set_index('C'), on = 'C', rsuffix = '_first_rnd')\
    .join(c_effect_later_round.set_index('C'), on = 'C', rsuffix = '_later_rnd')



In [23]:

    
plt.semilogx()
for col in [ col for col in c_effect_all if col != 'C' and 'first' in col ]:
    plt.plot(c_effect['C'], c_effect_all[col])
plt.legend(loc = 3)
plt.xlabel('C')
plt.ylabel('logloss')
plt.ylim(0.4, 0.75)
plt.show()



In [60]:

    
c_effect.min()









    Out[60]:





2013    0.577781
2014    0.565587
2015    0.467788
2016    0.530481
C       0.000100
dtype: float64

PCA on features used



In [ ]:

Look at who is contributing to logloss



In [120]:

    
# contribution to logloss
rc = res.copy()
ftc = feat_test.copy()
ftc['Id'] = get_key(ftc)
rc['logloss_contrib'] = -np.log(np.where(rc['Result'] == 1, rc['Pred'], 1 - rc['Pred'])) / len(rc)
ftc = pd.merge(rc, ftc, how = 'left', on = 'Id')

fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (10, 4))
im = axes[0].scatter(ftc['t_score'], ftc['t_rtg'], c = ftc['logloss_contrib'], vmin = 0, vmax = 0.025, cmap = plt.cm.get_cmap('coolwarm'))
axes[0].set_xlabel('t_score')
axes[0].set_ylabel('t_rtg')
#plt.colorbar(sc)
axes[1].scatter(-ftc['ln_dist_diff'], ftc['t_rtg'], c = ftc['logloss_contrib'], vmin = 0, vmax = 0.025, cmap = plt.cm.get_cmap('coolwarm'))
axes[1].set_xlabel('ln_dist_diff')
cb = fig.colorbar(im, ax=axes.ravel().tolist(), label = 'logloss_contrib')
plt.show()

Logloss contribution by round



In [121]:

    
tourney_rounds = tourney_raw[['Team1', 'Team2', 'Season', 'SlotMatchup']].copy()
tourney_rounds['Id'] = get_key(tourney_rounds)
tourney_rounds['round'] = tourney_rounds['SlotMatchup'].map(lambda s: int(s[1]))
tourney_rounds = tourney_rounds[['Id', 'round']]
ftc_with_rounds = pd.merge(ftc, tourney_rounds, how = 'left', on = 'Id')

fig, axs = plt.subplots(ncols=2, figsize = (10, 4))
sns.barplot(data = ftc_with_rounds, x = 'round', y = 'logloss_contrib', errwidth = 0, ax = axs[0])
sns.barplot(data = ftc_with_rounds, x = 'round', y = 'logloss_contrib', errwidth = 0, estimator=max, ax = axs[1])
axs[0].set_ylim(0, 0.015)
axs[1].set_ylim(0, 0.06)
plt.show()

A look at dynamics of ratings data



In [321]:

    
sns.lmplot('mean_rtg', 'std_rtg', data = ratings_eos, fit_reg = False)
plt.show()



In [353]:

    
ratings_eos_test = ratings_eos.copy()
ratings_eos_test['parabola_mean_model'] =(ratings_eos_test['mean_rtg'].max()/2)**2-(ratings_eos_test['mean_rtg'] - ratings_eos_test['mean_rtg'].max()/2)**2
sns.lmplot('parabola_mean_model', 'std_rtg', data = ratings_eos_test, fit_reg = False)
plt.show()



In [352]:

    
test_data_test = test_data.copy()
test_data_test['rtg_diff'] = test_data_test['mean_rtg_1'] - test_data_test['mean_rtg_2']
test_data_test['t_model'] = test_data_test['rtg_diff']/(test_data_test['std_rtg_1']**2 + test_data_test['std_rtg_2']**2)**0.5
#sns.lmplot('rtg_diff', 't_model', data = test_data_test, fit_reg = False)
sns.pairplot(test_data_test[['rtg_diff', 't_model']])
plt.show()

Quick investigation: looks like avg score decreases with log of distance traveled



In [128]:

    
dist_test = get_training_data(reg_season, team_geog, 2016)
w_dist_test = dist_test[['w_dist', 'Wscore']].rename(columns = {'w_dist' : 'dist', 'Wscore' : 'score'})
l_dist_test = dist_test[['l_dist', 'Lscore']].rename(columns = {'l_dist' : 'dist', 'Lscore' : 'score'})
dist_test = pd.concat([w_dist_test, l_dist_test]).reset_index()[['dist', 'score']]









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-128-167cca87b052> in <module>()
----> 1 dist_test = get_training_data(reg_season, team_geog, 2016)
      2 w_dist_test = dist_test[['w_dist', 'Wscore']].rename(columns = {'w_dist' : 'dist', 'Wscore' : 'score'})
      3 l_dist_test = dist_test[['l_dist', 'Lscore']].rename(columns = {'l_dist' : 'dist', 'Lscore' : 'score'})
      4 dist_test = pd.concat([w_dist_test, l_dist_test]).reset_index()[['dist', 'score']]

NameError: name 'get_training_data' is not defined



In [ ]:

    
plt.hist(dist_test['dist'])
plt.xlim(0, 3000)
plt.semilogy()
plt.show()



In [ ]:

    
bucket_size = 1
dist_test['bucket'] = bucket_size * (np.log(dist_test['dist'] + 1) // bucket_size)
dist_grp = dist_test.groupby('bucket').aggregate([np.mean, np.std, len])['score']
dist_grp['err'] = dist_grp['std'] / np.sqrt(dist_grp['len'])



In [ ]:

    
plt.plot(dist_grp['mean'])
plt.fill_between(dist_grp.index, 
                 (dist_grp['mean'] - 2*dist_grp['err']).values, 
                 (dist_grp['mean'] + 2*dist_grp['err']).values,
                 alpha = 0.3)
plt.xlabel('log of distance traveled')
plt.ylabel('avg score')
plt.show()

	Season	Daynum	Team1	Team2	score_1	score_2	posessions	dist_1	mean_rtg_1	...	std_rtg_2	num_rtg_2	adjem_1	adjem_2	adjt_1	adjt_2	std_mgn_1	std_mgn_2	elom_1	elom_2
1482	2016	11	1299	1345	40	81	69.3750	489.301506	333.896552	...	4.360173	62	-18.9160	24.6654	68.2101	68.2777	18.815393	21.857408	-1.012956	1.364967
1285	2016	37	1271	1277	35	78	66.0125	557.961678	316.551724	...	2.710859	62	-15.2109	29.5834	67.9338	67.3384	19.832705	22.683973	-1.324791	2.234999
1565	2016	50	1175	1428	58	105	71.0500	1915.896870	347.103448	...	7.217553	62	-24.5365	18.0909	67.2308	66.4873	19.087075	24.477933	-2.222088	1.363091
4993	2016	51	1148	1163	52	99	74.4000	28.949304	349.637931	...	6.818984	59	-26.7756	18.3083	68.3637	66.6498	16.391800	21.708097	-2.366199	1.313501

	Id	Result	Pred
0	2016_1277_1292	0	0.926133
1	2016_1393_1438	1	0.163649
2	2016_1143_1218	0	0.758979
3	2016_1114_1345	1	0.247886
4	2016_1372_1452	1	0.253782
5	2016_1458_1462	1	0.297508
6	2016_1320_1400	1	0.300942
7	2016_1242_1437	0	0.620614
8	2016_1231_1246	1	0.427953
9	2016_1124_1463	0	0.569694
10	2016_1211_1393	0	0.563849
11	2016_1112_1455	0	0.551972
12	2016_1211_1428	1	0.453506
13	2016_1314_1437	0	0.530796
14	2016_1173_1393	0	0.528484
15	2016_1153_1386	0	0.525397
16	2016_1328_1332	1	0.513272
17	2016_1338_1458	0	0.486382
18	2016_1211_1371	1	0.520524
19	2016_1323_1458	1	0.524347
20	2016_1344_1425	1	0.533233
21	2016_1323_1372	1	0.541952
22	2016_1328_1437	0	0.444530
23	2016_1276_1323	0	0.444343
24	2016_1139_1403	1	0.590411
25	2016_1274_1455	1	0.598120
26	2016_1160_1163	0	0.393382
27	2016_1181_1332	0	0.387228
28	2016_1328_1401	1	0.619735
29	2016_1333_1433	0	0.363056
...	...	...	...
33	2016_1292_1393	0	0.295919
34	2016_1231_1314	0	0.293655
35	2016_1235_1438	0	0.260924
36	2016_1234_1396	1	0.741763
37	2016_1332_1386	1	0.749091
38	2016_1218_1268	0	0.247900
39	2016_1268_1355	1	0.759495
40	2016_1234_1437	0	0.228492
41	2016_1151_1231	0	0.216956
42	2016_1181_1423	1	0.789615
43	2016_1201_1428	0	0.200167
44	2016_1320_1401	0	0.171157
45	2016_1233_1235	0	0.168415
46	2016_1328_1433	1	0.833722
47	2016_1314_1323	1	0.833816
48	2016_1246_1392	1	0.837882
49	2016_1139_1438	0	0.161033
50	2016_1314_1393	1	0.844950
51	2016_1242_1268	1	0.870685
52	2016_1314_1344	1	0.900806
53	2016_1163_1242	0	0.094444
54	2016_1451_1462	0	0.092781
55	2016_1138_1274	0	0.075859
56	2016_1401_1453	1	0.930645
57	2016_1167_1328	0	0.067718
58	2016_1421_1437	0	0.043569
59	2016_1214_1438	0	0.037007
60	2016_1221_1332	0	0.029815
61	2016_1195_1314	0	0.013139
62	2016_1122_1242	0	0.007319