In [87]:
import pandas as pd
import numpy as np
from scipy.stats import norm, bernoulli
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
Generate teams and data with time-varying skill
In [281]:
# Ground-truth skills: for each team, one [mean, std] pair per time period
# (three periods of time-varying skill).
true_team_skill = {
    'A': [[1200, 100], [1300, 90], [1650, 150]],
    'B': [[1100, 200], [1100, 200], [1200, 200]],
    'C': [[800, 100], [1100, 300], [500, 120]],
    'D': [[850, 120], [700, 500], [1200, 500]],
    'E': [[700, 500], [400, 800], [900, 600]],
}

beta = 50            # per-match performance noise (std dev)
num_matches = 1000
n_teams = len(true_team_skill)

# Draw random pairings (teams picked with unequal popularity), then drop self-matches.
match_ups = np.random.choice(list(true_team_skill.keys()), (num_matches, 2), p=[0.1, 0.25, 0.25, 0.35, 0.05])
match_ups = match_ups[match_ups[:, 0] != match_ups[:, 1]]
print("%i Number of Generated Matchups" % len(match_ups))

num_matches = len(match_ups)

# Split matches across the three periods (20% / 50% / 30%).
# BUG FIX: rounding every share independently can make the segment sizes
# sum to something other than num_matches (np.round uses banker's
# rounding), which would break the column_stack below with a shape
# mismatch — derive the last segment size from the remainder instead.
p_time = np.round(np.array([0.2, 0.5]) * num_matches).astype(int)
p_time = np.append(p_time, num_matches - p_time.sum())
t = np.array([0] * p_time[0] + [1] * p_time[1] + [2] * p_time[2])
print("%s Time set matches -- 0: %i 1: %i 2: %i" % ((len(t),) + tuple(p_time)))

# Append the period index as a third column: rows are (Team 1, Team 2, t).
match_ups = np.column_stack([match_ups, t])
Now let's generate three sets of time-series matches
In [282]:
# Simulate a winner for every matchup using the true, period-specific skills.
winner = []
for team_a, team_b, period in match_ups:
    skill_a = true_team_skill[team_a][int(period)]
    skill_b = true_team_skill[team_b][int(period)]

    # P(team_a wins) under the Gaussian skill-difference model.
    mean_diff = skill_a[0] - skill_b[0]
    scale_diff = np.sqrt(skill_a[1]**2 + skill_b[1]**2 + 2 * beta**2)
    p_a_wins = 1. - norm.cdf(0, loc=mean_diff, scale=scale_diff)

    outcome = bernoulli.rvs(p_a_wins)
    #print('%s vs %s - p: %.1f - %s won' % (team_a, team_b, p_a_wins*100., team_a if outcome else team_b))
    winner.append(team_a if outcome else team_b)
In [283]:
# Assemble the observations frame: one row per match with its winner.
obs = (
    pd.DataFrame(match_ups, columns=['Team 1', 'Team 2', 't'])
    .astype({'t': int})
    .assign(winner=winner)
)
obs.head()
Out[283]:
In [300]:
def vec2dict(s, n_teams, n_periods):
    """Unpack a flat parameter vector into named arrays.

    Layout of ``s``: n_teams mus, n_teams sigmas, n_periods*n_teams
    alphas (row-major by period), then beta and omega as the last two
    scalars. Inverse of dict2vec.
    """
    n_alpha = n_periods * n_teams
    mus = np.array(s[:n_teams])
    sigmas = np.array(s[n_teams:2 * n_teams])
    alphas = np.array(s[2 * n_teams:2 * n_teams + n_alpha]).reshape(n_periods, n_teams)
    return {
        'mu': mus,
        'sigma': sigmas,
        'alpha': alphas,
        'beta': s[-2],
        'omega': s[-1],
    }
def dict2vec(s):
    """Flatten the parameter dict back into the vector layout vec2dict expects."""
    vec = s['mu'].tolist()
    vec += s['sigma'].tolist()
    vec += s['alpha'].flatten().tolist()
    vec += [s['beta'], s['omega']]
    return vec
In [319]:
# Initial guess for the optimizer: a common prior for every team.
# NOTE(review): n_periods is defined in a *later* cell (In [322]) — on a
# fresh kernel this cell raises NameError and only ran here because of
# out-of-order execution. Move the n_periods definition above this cell.
skills_0 = dict2vec({
    'mu': np.array([1000]*n_teams),      # prior mean skill per team
    'sigma': np.array([150]*n_teams),    # prior skill std dev per team
    'alpha': np.zeros(shape=(n_periods, n_teams)),  # per-period drift, start at zero
    'beta': 50,    # per-match performance noise
    'omega': 150,  # per-period variance inflation
})
In [322]:
from scipy.optimize import minimize

# Number of skill-drift steps (= max period index; period 0 has no drift).
n_periods = obs['t'].max()

# Team letter -> column index in the parameter arrays.
mapping = {team: idx for idx, team in enumerate(['A', 'B', 'C', 'D', 'E'])}
def loglike(y, p):
    """Negative Bernoulli log-likelihood of outcomes ``y`` under win probabilities ``p``.

    Parameters
    ----------
    y : array-like of 0/1 outcomes.
    p : array-like of predicted probabilities, same shape as y.

    Returns the scalar negative log-likelihood. Probabilities are clipped
    away from exactly 0 and 1 so the optimizer never hits log(0) and
    produces NaN/inf while exploring the parameter space.
    """
    p = np.clip(p, 1e-12, 1. - 1e-12)  # numerical-stability guard
    return -1. * (np.sum(y * np.log(p) + (1. - y) * np.log(1 - p)))
def obj(skills):
    """Total negative log-likelihood of all observed matches given the flat
    parameter vector `skills` (see vec2dict for the layout).

    Relies on globals: obs, mapping, n_teams, n_periods, loglike.
    """
    s = vec2dict(skills, n_teams, n_periods)
    ll = 0
    for i in range(n_periods+1):
        # Matches played in period i.
        obs_per = obs[obs['t'] == i]
        # Random-walk skill evolution: from period 1 onward the mean
        # drifts by alpha[i-1] and the variance grows by omega^2.
        # NOTE: s['mu'] is reassigned each iteration, so the drift
        # accumulates across periods — the statement order here matters.
        s['mu'] = s['mu'] + s['alpha'][i-1] if i > 0 else s['mu']
        sigma_sq = sigma_sq + s['omega']**2 if i > 0 else s['sigma']**2
        # Win probability for Team 1 of each match under the Gaussian
        # skill-difference model.
        mean_diff = s['mu'][obs_per['Team 1'].map(mapping).values] - s['mu'][obs_per['Team 2'].map(mapping).values]
        var_diff = sigma_sq[obs_per['Team 1'].map(mapping).values] + sigma_sq[obs_per['Team 2'].map(mapping).values] + 2.*s['beta']**2
        p = 1.-norm.cdf(0., loc=mean_diff, scale = np.sqrt(var_diff))
        # Accumulate NLL of the observed winners for this period.
        ll += loglike((obs_per['Team 1'] == obs_per['winner']).values, p)
    return ll
In [323]:
# Sanity check: objective value at the initial guess.
obj(skills_0)
Out[323]:
In [328]:
# Fit by maximum likelihood. The inequality constraint keeps the sigma
# entries (flat-vector positions 5:10) non-negative.
# NOTE(review): omega (last element) is left unconstrained — presumably
# it should also be kept non-negative; confirm.
g = minimize(obj, x0=skills_0, constraints = {'type': 'ineq', 'fun': lambda x: x[5:10]})
print(g.message)
print(g.fun)
In [329]:
# Unpack the fitted parameter vector and visualize each team's fitted
# (period-0) skill distribution.
opt_skill = vec2dict(g.x, n_teams, n_periods)
print(opt_skill)

plots = norm.rvs(opt_skill['mu'], opt_skill['sigma'], size=(5000, 5))

f, ax = plt.subplots(figsize=(12, 8))
plt.ylim(0, 0.01)
# Plain loop instead of a list comprehension used only for its side
# effects (the comprehension also leaked a list of axes as cell output).
for i in range(5):
    sns.kdeplot(plots[:, i], shade=True, alpha=0.55, legend=True, ax=ax, label=i)
Out[329]:
In [330]:
# Reconstruct each team's mean skill trajectory across periods.
# BUG FIX: the original multiplied cur_skill by alpha and never updated
# cur_skill, so the trajectory was wrong (and collapses to zero when the
# fitted alphas stay near their zero initialization). The model in obj()
# is additive — mu_t = mu_{t-1} + alpha[t-1] — so the drift must be
# accumulated.
cur_skill = opt_skill['mu']
skill_over_time = [cur_skill]
for i in range(n_periods):
    cur_skill = cur_skill + opt_skill['alpha'][i]
    skill_over_time.append(cur_skill)
In [333]:
# One line per team: mean skill per period.
plt.plot(np.stack(skill_over_time))
Out[333]:
In [245]:
# Inspect the raw trajectory arrays.
skill_over_time
Out[245]:
In [ ]:
In [548]:
# NOTE(review): this cell appears stale relative to the cells above and
# will not run as written:
#  - opt_skill is now a dict (from vec2dict), so the slices below
#    (opt_skill[:5] etc.) raise TypeError; they assume the old flat g.x
#    vector layout.
#  - true_team_skill values are now per-period lists of [mu, sigma]
#    pairs, so param_one[0] is a list, not the mean skill.
#  - `trace` comes from the pymc3 cells further down — another
#    out-of-order dependency.
# Re-derive infer_mean/std/beta from opt_skill before trusting output.
infer_mean = opt_skill[:5]
infer_std = opt_skill[5:-1]
infer_beta = opt_skill[-1]

err = {'mcmc': [], 'opt': []}
# Compare true vs optimizer vs MCMC win probabilities for each pairing
# observed in the data.
for pair, k in obs.groupby(['Team 1', 'Team 2']).count().itertuples():
    param_one = true_team_skill[pair[0]]
    param_two = true_team_skill[pair[1]]

    p_one_true = 1.-norm.cdf(0, loc=param_one[0]-param_two[0], scale=np.sqrt(param_one[1]**2 + param_two[1]**2 + 50.**2))
    p_one_opt = 1.-norm.cdf(0, loc=infer_mean[mapping[pair[0]]]-infer_mean[mapping[pair[1]]], scale=np.sqrt(infer_std[mapping[pair[0]]]**2 + infer_std[mapping[pair[1]]]**2 + infer_beta**2))
    p_one_mcmc = (1./(1+np.exp(-0.005*(trace['rating'][:,mapping[pair[0]]] - trace['rating'][:,mapping[pair[1]]])))).mean()

    err['mcmc'].append(p_one_true-p_one_mcmc); err['opt'].append(p_one_true-p_one_opt);

    print('%s vs %s : true - %.2f pct | optim - %.2f pct | mcmc - %.2f pct' %
          (pair[0], pair[1], p_one_true*100., p_one_opt*100., p_one_mcmc*100.))
In [549]:
# Mean squared error of the MCMC win-probability estimates vs truth.
np.mean(np.power(err['mcmc'],2))
Out[549]:
In [550]:
# Mean squared error of the optimizer's win-probability estimates vs truth.
np.mean(np.power(err['opt'],2))
Out[550]:
In [ ]:
In [7]:
import pymc3 as pm
import theano.tensor as tt
In [13]:
# Team letter -> index (re-defined here so the pymc3 section can run on its own).
mapping = {v:k for k,v in dict(enumerate(['A', 'B', 'C', 'D', 'E'])).items()}

with pm.Model() as rating_model:
    # NOTE(review): this rebinds the global name `beta` (previously the
    # scalar 50 used for data generation) to a pymc3 RV — later cells
    # that use `beta` as a number will break on re-run.
    beta = pm.Uniform('beta', 30., 50.)

    skills = pm.Normal('rating', 1000., 150., shape=n_teams)
    # One latent performance per team, shared across all of its matches.
    performance = pm.Normal('performance', skills, beta, shape=n_teams)

    # Performance difference for every observed match; logistic link
    # with a fixed 0.005 scale maps it to a win probability.
    diff = performance[obs['Team 1'].map(mapping).values] - performance[obs['Team 2'].map(mapping).values]
    p = tt.nnet.sigmoid(0.005*diff)

    # Bernoulli log-likelihood written as a DensityDist; the commented
    # pm.Bernoulli line below is the equivalent built-in form.
    err = pm.DensityDist('observed', lambda x: tt.sum(x*tt.log(p)+(1.-x)*tt.log(1-p)), observed=(obs['Team 1'] == obs['winner']).values)
    #err = pm.Bernoulli('observed', p=p, observed=(obs['Team 1'] == obs['winner']).values)
In [ ]:
with rating_model:
    #start = pm.find_MAP()
    # NOTE(review): tune=0 disables NUTS warm-up/adaptation, so the
    # sampler may mix poorly — consider leaving the default tuning on.
    trace = pm.sample(10000, tune=0) #step=pm.Metropolis(), start=start,
In [ ]:
# Diagnostics: trace and posterior plots for all model parameters.
pm.traceplot(trace)
pm.plot_posterior(trace)
In [538]:
# Posterior summaries of the team ratings.
infer_mean = trace['rating'].mean(axis=0)
infer_std = trace['rating'].std(axis=0)

for pair, k in obs.groupby(['Team 1', 'Team 2']).count().itertuples():
    param_one = true_team_skill[pair[0]]
    param_two = true_team_skill[pair[1]]

    # NOTE(review): `beta` here is the pymc3 RV bound inside the model
    # cell, not the scalar 50 used to generate the data, so this line
    # will not evaluate as intended — use the numeric noise scale.
    # Also, true_team_skill values are per-period lists of [mu, sigma]
    # pairs, so param_one[0]/param_one[1] are lists, not scalars — this
    # comparison predates the time-varying refactor.
    p_one_true = 1.-norm.cdf(0, loc=param_one[0]-param_two[0], scale=np.sqrt(param_one[1]**2 + param_two[1]**2 + beta**2))
    p_one_infer = (1./(1+np.exp(-0.005*(trace['rating'][:,mapping[pair[0]]] - trace['rating'][:,mapping[pair[1]]])))).mean()
    #p_one_infer = 1.-norm.cdf(0, loc=infer_one[0]-infer_two[0], scale=np.sqrt(infer_one[1]**2 + infer_two[1]**2 + 50.**2))

    print('%s vs %s : true - %.2f pct | infer - %.2f pct' % (pair[0], pair[1], p_one_true*100., p_one_infer*100.))