In [1]:
from __future__ import division

import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from ggplot import *

import rating_utils

%matplotlib inline

Data processing

We load the match data and randomly split it into a training set and a test set (roughly 70/30).


In [2]:
data = pd.read_csv('data/12-13.csv')

indices = np.random.rand(len(data)) < 0.7

train = data[indices].reset_index()
test = data[~indices].reset_index()

print rating_utils.get_teams(data)

print len(train), len(test)


['Washington Wizards' 'Dallas Mavericks' 'Boston Celtics'
 'Sacramento Kings' 'Houston Rockets' 'Memphis Grizzlies'
 'San Antonio Spurs' 'Denver Nuggets' 'Golden State Warriors'
 'Los Angeles Lakers' 'Indiana Pacers' 'Oklahoma City Thunder'
 'Milwaukee Bucks' 'Chicago Bulls' 'Los Angeles Clippers' 'Utah Jazz'
 'Miami Heat' 'Portland Trail Blazers' 'Detroit Pistons' 'Toronto Raptors'
 'New Orleans Hornets' 'Charlotte Bobcats' 'Cleveland Cavaliers'
 'Philadelphia 76ers' 'Atlanta Hawks' 'Phoenix Suns'
 'Minnesota Timberwolves' 'New York Knicks' 'Orlando Magic' 'Brooklyn Nets']
883 346

Let's plot the distribution of score differences (home points minus visitor points).


In [3]:
# Per-match score margin: home points minus visitor points.
margins = pd.DataFrame({'score_difference': data['HomePTS'] - data['VisitorPTS']})

ggplot(margins, aes(x='score_difference')) + \
    geom_histogram(fill='#33bbff', binwidth=2) + \
    ylab('match count')


/home/young/anaconda2/lib/python2.7/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
Out[3]:
<ggplot: (8775767777893)>

1. Random baseline

We generate a random rating for each team by drawing from a standard Gaussian distribution. This serves as the baseline for comparison.


In [4]:
random_model = rating_utils.RandomRating()
random_model.fit(train, rating_utils.get_teams(data))
print random_model.ratings

plt.hist(random_model.ratings)
pass


[ 0.79567096  0.14754553  2.71883523  0.39473101 -1.8430706   0.00447958
 -0.34813816 -1.49043727 -0.42257538  1.61168059 -0.32742592  1.15271058
  1.57718275 -0.91628591 -0.20130521  1.65651027  0.24797673 -0.98143717
  0.78004954  0.96273178 -0.36682807 -0.28831726 -1.02841037  0.29900253
 -1.28669059  1.87437749  1.5011294   1.7656079   0.50091261  1.18715367]

We now compute the prediction accuracy and cross entropy loss


In [5]:
print "Training and testing accuracy:"
print random_model.test_accuracy(train), random_model.test_accuracy(test)
print "\nTraining and testing cross entropy loss:"
print random_model.test_cross_entropy_loss(train), random_model.test_cross_entropy_loss(test)


Training and testing accuracy:
0.481313703284 0.476878612717

Training and testing cross entropy loss:
0.980140489866 1.01310834233

2. Fit and test Massey's method

We fit and test Massey's method.

We calculate the ratings and plot out the distribution.


In [6]:
massey_model = rating_utils.MasseyMethod()
massey_model.fit(train, rating_utils.get_teams(data))
print massey_model.ratings

plt.hist(massey_model.ratings)
pass


[ -2.44411343e+00   2.33502629e-02  -4.16802978e-03  -4.04243405e+00
   4.35243335e+00   3.85428723e+00   6.45396563e+00   5.00666388e+00
   1.30333867e+00   2.10765449e+00   1.78477683e+00   9.63386422e+00
  -2.61287789e+00   2.26168787e-01   7.45399221e+00   6.05793510e-01
   7.49771544e+00  -3.56801718e+00  -3.55278373e+00  -3.08138296e+00
  -3.09442370e+00  -1.01056266e+01  -4.58656753e+00  -2.99553130e+00
  -4.16827169e-01  -5.93683332e+00  -1.41767769e+00   3.86588103e+00
  -6.95240842e+00   6.41787464e-01]

We now compute the prediction accuracy and cross entropy loss


In [7]:
print "Training and testing accuracy:"
print massey_model.test_accuracy(train), massey_model.test_accuracy(test)
print "\nTraining and testing cross entropy loss:"
print massey_model.test_cross_entropy_loss(train), massey_model.test_cross_entropy_loss(test)


Training and testing accuracy:
0.687429218573 0.682080924855

Training and testing cross entropy loss:
0.595424327776 0.627697151595

3. Fit and test Markov's method on matches

We fit and test Markov's method on match results.

We calculate the ratings and plot out the distribution.


In [8]:
markov_match_model = rating_utils.MarkovMatchMethod()
markov_match_model.fit(train, rating_utils.get_teams(data))
print markov_match_model.ratings

plt.hist(markov_match_model.ratings)
pass


[ 0.03293325  0.03136271  0.03300239  0.02307472  0.04178854  0.04571438
  0.05143692  0.0517333   0.04031221  0.03387243  0.04484356  0.04530344
  0.02862582  0.04165038  0.05016916  0.03066735  0.05409349  0.0308616
  0.01818625  0.02365773  0.01968396  0.01228682  0.0194283   0.03064052
  0.02534181  0.01626492  0.02620865  0.04849667  0.01166362  0.03669509]

We now compute the prediction accuracy and cross entropy loss


In [9]:
print "Training and testing accuracy:"
print markov_match_model.test_accuracy(train), markov_match_model.test_accuracy(test)
print "\nTraining and testing cross entropy loss:"
print markov_match_model.test_cross_entropy_loss(train), markov_match_model.test_cross_entropy_loss(test)


Training and testing accuracy:
0.680634201586 0.664739884393

Training and testing cross entropy loss:
0.613967832856 0.635235597369

4. Fit and test Markov's method on score

We fit and test Markov's method on match score.

We calculate the ratings and plot out the distribution.


In [10]:
markov_score_model = rating_utils.MarkovScoreMethod()
markov_score_model.fit(train, rating_utils.get_teams(data))
print markov_score_model.ratings

plt.hist(markov_score_model.ratings)
pass


[ 0.02463033  0.02586289  0.02464476  0.02108378  0.05458431  0.04833711
  0.06818221  0.0622169   0.03960529  0.03433933  0.04382183  0.06708902
  0.01919256  0.03317188  0.06132977  0.03081374  0.0559672   0.02016107
  0.0168493   0.01567889  0.0193361   0.00740474  0.01377163  0.02483105
  0.02060649  0.0145523   0.03161613  0.05874523  0.01168215  0.02989203]

We now compute the prediction accuracy and cross entropy loss


In [11]:
print "Training and testing accuracy:"
print markov_score_model.test_accuracy(train), markov_score_model.test_accuracy(test)
print "\nTraining and testing cross entropy loss:"
print markov_score_model.test_cross_entropy_loss(train), markov_score_model.test_cross_entropy_loss(test)


Training and testing accuracy:
0.679501698754 0.667630057803

Training and testing cross entropy loss:
0.603819260159 0.632616231511

5. Fit and test Colley's method

We fit and test Colley's method.

We calculate the ratings and plot out the distribution.


In [12]:
colley_model = rating_utils.ColleyMethod()
colley_model.fit(train, rating_utils.get_teams(data))
print colley_model.ratings

plt.hist(colley_model.ratings)
pass


[ 0.15223943  0.00990993 -0.01046627  0.13592572 -0.09297749 -0.15042266
 -0.17317308 -0.12482654 -0.08909133 -0.04758242 -0.01420618 -0.20478715
  0.08134517 -0.04985288 -0.20801193 -0.01207007 -0.26277145  0.124549
  0.16827019  0.10748285  0.17434186  0.29088243  0.20806176  0.07549322
  0.01817214  0.22590271  0.09096821 -0.13568936  0.28759781 -0.05797225]

We now compute the prediction accuracy and cross entropy loss


In [13]:
print "Training and testing accuracy:"
print colley_model.test_accuracy(train), colley_model.test_accuracy(test)
print "\nTraining and testing cross entropy loss:"
print colley_model.test_cross_entropy_loss(train), colley_model.test_cross_entropy_loss(test)


Training and testing accuracy:
0.678369195923 0.664739884393

Training and testing cross entropy loss:
0.62924294213 0.710944544422