In [2]:
import pandas as pd
import numpy as np
from scipy.stats import norm, bernoulli
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
Generate Players and Data
In [15]:
true_team_skill = {
    'A': {
        'A Good': [500, 70],
        'A Decent': [400, 60],
        'A Ok': [350, 40],
    },
    'B': {
        'B Amazing': [600, 50],
        'B Decent': [400, 60],
        'B Bad': [150, 70],
    },
    # player names must be unique within a team, otherwise the dict silently drops entries
    'C': {
        'C Good One': [350, 70],
        'C Good Two': [350, 70],
        'C Good Three': [350, 70],
    },
    'D': {
        'D Good': [350, 70],
        'D Inconsistent': [550, 200],
        'D Consistent': [450, 40],
    },
    'E': {
        'E Bad One': [250, 70],
        'E Inconsistent': [400, 220],
        'E Bad Two': [150, 70],
    },
}
beta = 50
num_matches = 300
n_teams = len(true_team_skill)
# draw pairs of teams, with some teams playing more often than others
match_ups = np.random.choice(list(true_team_skill.keys()), (num_matches, 2), p=[0.1, 0.25, 0.25, 0.35, 0.05])
# drop any matchup where a team was drawn against itself
match_ups = match_ups[match_ups[:,0] != match_ups[:,1]]
print("%i matchups generated" % len(match_ups))
In [12]:
winner = []
for match in match_ups:
    team_one = true_team_skill[match[0]]
    team_two = true_team_skill[match[1]]

    # a team's skill is the sum of its players' means; its variance the sum of their variances
    team_skill_one = np.sum([x[0] for i, x in team_one.items()])
    team_skill_two = np.sum([x[0] for i, x in team_two.items()])
    team_sigma_one = np.sum([x[1]**2 for i, x in team_one.items()])
    team_sigma_two = np.sum([x[1]**2 for i, x in team_two.items()])

    # P(team one wins) = P(skill difference plus match noise > 0)
    p_one = 1.-norm.cdf(0, loc=team_skill_one-team_skill_two,
                        scale=np.sqrt(team_sigma_one + team_sigma_two + beta**2))
    res = bernoulli.rvs(p_one)
    #print('%s vs %s - p: %.1f - %s won' % (match[0], match[1], p_one*100., match[int(np.logical_not(res))]))
    winner.append(match[int(np.logical_not(res))])
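As a spot check (an addition here, not in the original notebook), the empirical win rate in the simulated results can be compared against the analytic probability for one pairing, e.g. A vs B:
mask = (match_ups[:, 0] == 'A') & (match_ups[:, 1] == 'B')
if mask.any():
    # empirical rate at which A beat B when listed as Team 1
    print('A beat B in %.1f%% of %i simulated meetings' % (
        100. * np.mean(np.array(winner)[mask] == 'A'), mask.sum()))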
In [14]:
np.sqrt(team_sigma_two)
Out[14]:
In [394]:
obs = pd.DataFrame(match_ups, columns=['Team 1', 'Team 2'])
obs['winner'] = winner
obs.head()
Out[394]:
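A quick added check (not from the original output) of the base rate at which the 'Team 1' slot wins, since that binary outcome is what both models below are fit to:
print((obs['Team 1'] == obs['winner']).mean())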
In [539]:
from scipy.optimize import minimize
# team letter -> index; also defined in the PyMC3 cell below (In [534]), which was executed before this one
mapping = {team: i for i, team in enumerate(['A', 'B', 'C', 'D', 'E'])}

# starting point: 5 team means, 5 team standard deviations, then the noise term beta
skills_0 = np.array([1000.]*5 + [150.]*5 + [50.])

def loglike(y, p):
    # negative Bernoulli log-likelihood
    return -1.*(np.sum(y*np.log(p) + (1.-y)*np.log(1.-p)))

def obj(skills):
    # skills[:5] are team means, skills[5:10] team sigmas, skills[-1] plays the role of beta
    mean_diff = skills[obs['Team 1'].map(mapping).values] - skills[obs['Team 2'].map(mapping).values]
    var_diff = skills[obs['Team 1'].map(mapping).values + 5]**2 + skills[obs['Team 2'].map(mapping).values + 5]**2 + skills[-1]**2
    p = 1. - norm.cdf(0., loc=mean_diff, scale=np.sqrt(var_diff))
    return loglike((obs['Team 1'] == obs['winner']).values, p)
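One practical caveat: if the optimizer wanders somewhere norm.cdf returns exactly 0 or 1, the logs in loglike blow up to -inf/nan. A small guard, sketched here as an optional variant (not in the original), clips the probabilities first:
def loglike_safe(y, p, eps=1e-9):
    # keep p strictly inside (0, 1) so the log stays finite
    p = np.clip(p, eps, 1. - eps)
    return -1. * np.sum(y * np.log(p) + (1. - y) * np.log(1. - p))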
In [540]:
g = minimize(obj, x0=skills_0)
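Since minimize falls back to unconstrained BFGS here, it is worth checking the result object before trusting the point estimates (a hypothetical addition, not in the original run):
print(g.success, g.message)   # did the optimizer report convergence?
print(g.fun)                  # final negative log-likelihood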
In [541]:
opt_skill = g.x
print(opt_skill)

# draw samples from each team's fitted skill distribution and plot the densities
# (np.abs guards against a negative fitted scale from the unconstrained optimizer)
plots = norm.rvs(opt_skill[:5], np.abs(opt_skill[5:-1]), size=(2000, 5))
f, ax = plt.subplots(figsize=(12, 8))
[sns.kdeplot(plots[:, i], shade=True, alpha=0.55, legend=True, ax=ax, label=team)
 for team, i in mapping.items()]
Out[541]:
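For reference (an added comparison, not in the original output), the fitted team means can be lined up against the true team totals, i.e. the sum of each roster's player means:
true_totals = {t: sum(p[0] for p in roster.values()) for t, roster in true_team_skill.items()}
for team, idx in mapping.items():
    print('%s: fitted %.0f vs true %.0f' % (team, opt_skill[idx], true_totals[team]))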
In [548]:
infer_mean = opt_skill[:5]
infer_std = opt_skill[5:-1]
infer_beta = opt_skill[-1]
err = {'mcmc': [], 'opt': []}
# note: `trace` comes from the PyMC3 model defined below (In [534]-[535]), which was run before this cell
for pair, k in obs.groupby(['Team 1', 'Team 2']).count().itertuples():
    players_one = true_team_skill[pair[0]]
    players_two = true_team_skill[pair[1]]
    # true win probability, built the same way the matches were simulated
    mu_diff = np.sum([x[0] for x in players_one.values()]) - np.sum([x[0] for x in players_two.values()])
    var_sum = np.sum([x[1]**2 for x in players_one.values()]) + np.sum([x[1]**2 for x in players_two.values()])
    p_one_true = 1.-norm.cdf(0, loc=mu_diff, scale=np.sqrt(var_sum + beta**2))
    p_one_opt = 1.-norm.cdf(0, loc=infer_mean[mapping[pair[0]]]-infer_mean[mapping[pair[1]]], scale=np.sqrt(infer_std[mapping[pair[0]]]**2 + infer_std[mapping[pair[1]]]**2 + infer_beta**2))
    p_one_mcmc = (1./(1.+np.exp(-0.005*(trace['rating'][:,mapping[pair[0]]] - trace['rating'][:,mapping[pair[1]]])))).mean()
    err['mcmc'].append(p_one_true-p_one_mcmc); err['opt'].append(p_one_true-p_one_opt)
    print('%s vs %s : true - %.2f pct | optim - %.2f pct | mcmc - %.2f pct' %
          (pair[0], pair[1], p_one_true*100., p_one_opt*100., p_one_mcmc*100.))
In [549]:
np.mean(np.power(err['mcmc'],2))
Out[549]:
In [550]:
np.mean(np.power(err['opt'],2))
Out[550]:
In [533]:
import pymc3 as pm
import theano.tensor as tt
In [534]:
# team letter -> index into the rating vector
mapping = {team: i for i, team in enumerate(['A', 'B', 'C', 'D', 'E'])}

with pm.Model() as rating_model:
    #beta = pm.Normal('beta', 50., 10.)

    # latent per-team skill, and the performance actually shown on match day
    skills = pm.Normal('rating', 1000., 150., shape=n_teams)
    performance = pm.Normal('performance', skills, 50., shape=n_teams)

    diff = performance[obs['Team 1'].map(mapping).values] - performance[obs['Team 2'].map(mapping).values]
    p = tt.nnet.sigmoid(0.005*diff)

    # Bernoulli log-likelihood of the 'Team 1' slot winning, expressed as a DensityDist
    err = pm.DensityDist('observed', lambda x: tt.sum(x*tt.log(p)+(1.-x)*tt.log(1.-p)), observed=(obs['Team 1'] == obs['winner']).values)
    #err = pm.Bernoulli('observed', p=p, observed=(obs['Team 1'] == obs['winner']).values)
In [535]:
with rating_model:
    #start = pm.find_MAP()
    trace = pm.sample(10000, tune=0) #step=pm.Metropolis(), start=start,
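A quick convergence check (an addition, not shown in the original notebook); since tuning is disabled above with tune=0, the effective sample sizes and Rhat values reported by pm.summary are worth a look:
print(pm.summary(trace))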
In [536]:
pm.traceplot(trace)
pm.plot_posterior(trace)
Out[536]:
In [538]:
infer_mean = trace['rating'].mean(axis=0)
infer_std = trace['rating'].std(axis=0)
for pair, k in obs.groupby(['Team 1', 'Team 2']).count().itertuples():
    players_one = true_team_skill[pair[0]]
    players_two = true_team_skill[pair[1]]
    # true win probability, built the same way the matches were simulated
    mu_diff = np.sum([x[0] for x in players_one.values()]) - np.sum([x[0] for x in players_two.values()])
    var_sum = np.sum([x[1]**2 for x in players_one.values()]) + np.sum([x[1]**2 for x in players_two.values()])
    p_one_true = 1.-norm.cdf(0, loc=mu_diff, scale=np.sqrt(var_sum + beta**2))
    p_one_infer = (1./(1.+np.exp(-0.005*(trace['rating'][:,mapping[pair[0]]] - trace['rating'][:,mapping[pair[1]]])))).mean()
    #p_one_infer = 1.-norm.cdf(0, loc=infer_one[0]-infer_two[0], scale=np.sqrt(infer_one[1]**2 + infer_two[1]**2 + 50.**2))
    print('%s vs %s : true - %.2f pct | infer - %.2f pct' % (pair[0], pair[1], p_one_true*100., p_one_infer*100.))