In [9]:
%matplotlib qt4
from models import tools, models, filters, tests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [16]:
# Load at most 20000 rows, then keep only the rows that pass both filters.
data = tools.load_data(limit=20000)
open_mask = filters.open_questions(data)
world_mask = filters.world_countries(data)
# NOTE(review): `train` is not read by any later cell visible in this notebook
# (the split below is taken from `data`) -- confirm which frame was intended.
train = data[open_mask & world_mask]
In [17]:
def intervals(r):
    """Return the squares of 5, 10, 15, ... for every step below int(sqrt(r)).

    The upper bound is exclusive, so a step equal to int(sqrt(r)) is not
    included. The result is an empty list when int(sqrt(r)) <= 5.
    """
    upper = int(np.sqrt(r))
    squares = []
    for step in range(5, upper, 5):
        squares.append(step ** 2)
    return squares
In [18]:
# Learning curve: fit the Elo model on progressively larger prefixes of the
# training split and record the train / validation RMSE for each prefix size.
train_examples = []
train_errs, valid_errs = [], []
# NOTE(review): the split is taken from the unfiltered `data`, not the filtered
# `train` frame built earlier -- confirm this is intended.
train_set, valid_set = tools.split_data(data)
limit = len(train_set)
for i in intervals(limit):
    # A new EloModel and PerformanceTest are built for every prefix size.
    elo_test = tests.PerformanceTest(models.EloModel(), data)
    elo_test.test_set = valid_set
    elo_test.train_set = train_set[:i]  # first i rows of the training split
    elo_test.run()
    # The former `valid_set['prediction'] = valid_set.apply(elo_train.predict, ...)`
    # line was removed: `elo_train` is not defined anywhere in this notebook
    # (NameError on a fresh kernel) and nothing read the resulting column.
    train_err = elo_test.results['train'].rmse
    valid_err = elo_test.results['test'].rmse
    train_errs.append(train_err)
    valid_errs.append(valid_err)
    train_examples.append(i)
    tools.echo('examples: {}, train RMSE: {}, cross RMSE: {}'.format(i, train_err, valid_err))
In [19]:
# Plot train vs. validation RMSE against training-set size (log-scaled x axis).
x = train_examples
ax = plt.gca()  # same current axes the pyplot state functions draw on
ax.plot(x, train_errs, 'g-', label='training set')
ax.plot(x, valid_errs, 'r-', label='cross validation set')
ax.legend(loc='lower right')
ax.set_xscale('log')
ax.set_xlabel('Number of training examples.')
ax.set_ylabel('RMSE')
Out[19]:
In [ ]: