In [1]:
import os
import pandas as pd
import numpy as np
import pymc3 as pm
from sklearn.metrics import log_loss, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
base_dir = os.path.join("/Users", "sbussmann", "Development", "buda", "buda-ratings")
interim_dir = os.path.join(base_dir, 'data', 'interim')
In [3]:
league_id = 40264
winloss = pd.read_csv(os.path.join(interim_dir, "winloss_divprior_{}.csv".format(league_id)))
In [4]:
winloss.head()
Out[4]:
In [5]:
winloss.shape
Out[5]:
In [6]:
teams = set(np.append(winloss['Team A'].unique(), winloss['Team B'].unique()))
In [7]:
rating_prior_div = {
    '4/3 Div 1': 3.0,
    '4/3 Div 2': 0.0,
    '4/3 Div 3': -1.0,
    '5/2 Div 1': 3.0,
    '5/2 Div 2': 0.0,
    '5/2 Div 3': -1.0
}
alphas = []
for i in range(len(teams)):
    # Find the team's division from whichever side of the matchup it appears on,
    # then use that division's prior mean as the team's prior rating.
    if i in winloss['Index A'].values:
        index = winloss['Index A'] == i
        div = winloss.loc[index, 'Div A'].unique()[0]
    else:
        index = winloss['Index B'] == i
        div = winloss.loc[index, 'Div B'].unique()[0]
    alpha = rating_prior_div[div]
    alphas.append(alpha)
In [9]:
teamA = winloss['Index A'].values
teamB = winloss['Index B'].values
with pm.Model() as model:
    # sharpness = pm.HalfStudentT('sharpness', sd=2.5, nu=3)
    # One latent rating per team, centered on the division-based prior means.
    ratings = pm.Normal('ratings', mu=alphas, sd=1.0, shape=len(teams))
    # Each game is a Bernoulli trial: P(team A wins) is the logistic of the rating gap.
    deltaRating = ratings[teamA] - ratings[teamB]
    p = pm.math.sigmoid(deltaRating)
    win = pm.Bernoulli('win', p, observed=winloss['Team A Wins'].values)
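The model above treats each game as a Bernoulli trial whose success probability is the logistic function of the rating gap. As a hypothetical illustration (not from the original run), a one-point rating edge corresponds to roughly a 73% chance of winning:
In [ ]:
# Illustrative sketch only: win probability implied by a hypothetical rating gap of 1.0.
delta_example = 1.0
1 / (1 + np.exp(-delta_example))  # ~0.73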
In [11]:
with model:
    trace = pm.sample(1000)
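The original notebook goes straight to point estimates; as a hedged aside, a quick sanity check on the sampler could use PyMC3's built-in summary and trace plots on the same trace:
In [ ]:
# Sketch (not part of the original run): basic convergence checks on the trace.
# pm.summary reports posterior stats including R-hat; pm.traceplot shows the chains.
pm.summary(trace)
pm.traceplot(trace);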
In [44]:
# meanratings = trace.get_values('ratings').mean(axis=0)
# Use the posterior median (50th percentile) of each team's rating as its point estimate.
meanratings = np.percentile(trace.get_values('ratings'), 50, axis=0)
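Since the full posterior is available, each team's rating uncertainty can be summarized the same way; this sketch (not in the original notebook) takes the 16th and 84th percentiles as a rough one-sigma band:
In [ ]:
# Sketch: per-team posterior spread from the same samples as the median above.
rating_samples = trace.get_values('ratings')
rating_lo = np.percentile(rating_samples, 16, axis=0)
rating_hi = np.percentile(rating_samples, 84, axis=0)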
In [13]:
names = []
for i, meanrating in enumerate(meanratings):
    # Recover each team's name from whichever side of the matchup it appears on.
    if i in winloss['Index A'].values:
        index = winloss['Index A'] == i
        name = winloss.loc[index, 'Team A'].unique()[0]
    else:
        index = winloss['Index B'] == i
        name = winloss.loc[index, 'Team B'].unique()[0]
    names.append(name)
    # print("{}: {:.3f}".format(name, meanrating))
In [14]:
ratingsdf = pd.DataFrame({
    'mean_rating': meanratings
}, index=names)
In [15]:
plusminus = pd.read_csv(os.path.join(interim_dir, 'plusminus_{}.csv'.format(league_id)), index_col='Team Name')
In [16]:
totaldf = plusminus.join(ratingsdf, how='inner')
In [18]:
totaldf.sort_values('mean_rating', ascending=False)
Out[18]:
In [19]:
totaldf.groupby('divname').mean()
Out[19]:
In [20]:
trace['ratings'].shape
Out[20]:
In [21]:
def get_index(team_name, df):
    """Look up a team's integer index by its name."""
    team_index = df.loc[team_name, 'Index']
    return team_index
In [22]:
def calc_prob(indexA, indexB, trace):
    """Posterior mean probability that team A beats team B, averaged over all samples."""
    # sharpness = trace['sharpness']
    deltaRating = trace['ratings'][:, indexA] - trace['ratings'][:, indexB]
    probAWins = 1. / (1 + np.exp(-deltaRating))
    return probAWins.mean()
In [23]:
simprobs = calc_prob(0, 1, trace)
In [24]:
simprobs
Out[24]:
In [25]:
winloss_with_dup = pd.read_csv(os.path.join(interim_dir, 'winloss_with_duplicates.csv'))
In [26]:
winloss_with_dup['predicted'] = winloss_with_dup.apply(lambda x: calc_prob(x['Index A'], x['Index B'], trace), axis=1)
In [27]:
winloss_with_dup.head()
Out[27]:
In [28]:
winloss_with_dup['predBin'] = winloss_with_dup['predicted'].apply(lambda x: np.round(x, 1))
In [29]:
winloss_with_dup.head()
Out[29]:
In [30]:
binned = winloss_with_dup.groupby('predBin').mean()
binnedstd = winloss_with_dup.groupby('predBin').std()
binnedhi = binned + binnedstd
binnedlo = binned - binnedstd
In [31]:
sns.set_context('talk')
f, axes = plt.subplots(1, 2, figsize=(13, 6))
ax = axes[0]
ax.plot(winloss_with_dup['predicted'], winloss_with_dup['Team A Wins'], 'o', alpha=0.1)
ax.set_ylabel('Actual Result for Team A')
ax.set_xlabel('Predicted Team A Winning Percentage')
ax = axes[1]
ax.plot(binned['Team A Wins'], 'o-', color='salmon')
ax.fill_between(binned.index, binnedlo['Team A Wins'], binnedhi['Team A Wins'], alpha=0.3, color='salmon')
ax.set_ylim([0, 1])
ax.set_ylabel('Actual Team A Winning Percentage by Bin')
ax.set_xlabel('Predicted Team A Winning Percentage by Bin')
plt.tight_layout()
In [32]:
team_log_loss = winloss_with_dup.groupby('Team A').apply(
    lambda x: log_loss(x['Team A Wins'].astype('int'), x['predicted'], labels=[0, 1]))
In [33]:
team_accuracy = winloss_with_dup.groupby('Team A').apply(
    lambda x: accuracy_score(x['Team A Wins'].astype('int'), np.round(x['predicted'])))
In [34]:
sns.distplot(team_accuracy, kde=False, bins=15)
Out[34]:
In [35]:
sns.distplot(team_log_loss, kde=False, bins=15)
Out[35]:
In [37]:
log_loss(winloss_with_dup['Team A Wins'].astype('int'), winloss_with_dup['predicted'])
Out[37]:
In [38]:
accuracy_score(winloss_with_dup['Team A Wins'].astype('int'), np.round(winloss_with_dup['predicted']))
Out[38]:
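For context (not computed in the original notebook), an uninformative baseline that predicts 0.5 for every game has a log loss of ln 2 ≈ 0.693, against which the model's overall log loss above can be compared:
In [ ]:
# Sketch: baseline log loss for a constant 0.5 prediction on every game.
baseline = np.full(len(winloss_with_dup), 0.5)
log_loss(winloss_with_dup['Team A Wins'].astype('int'), baseline)  # ln(2) ~= 0.693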
In [39]:
def inspect(full_outcomes, team_name):
    """Show all games for the given team (as Team A), dropping bookkeeping columns."""
    index = full_outcomes['Team A'] == team_name
    outcomes_index = full_outcomes[index]
    return outcomes_index.drop(['divname', 'Index A', 'Index B', 'predBin'], axis=1)
In [40]:
ins = inspect(winloss_with_dup, 'Injustice League')
ins
Out[40]:
In [41]:
ins = inspect(winloss_with_dup, 'Gothrilla')
ins
Out[41]:
In [42]:
ins = inspect(winloss_with_dup, 'SnakeCountryBromance')
ins
Out[42]:
In [43]:
ins = inspect(winloss_with_dup, 'Store Bought Dirt')
ins
Out[43]: