In [9]:
%matplotlib qt4
from models import tools, models, filters, tests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [16]:
# Load at most 20000 records through the project loader.
data = tools.load_data(limit=20000)

# Build one boolean mask per project filter, then keep only the rows that
# satisfy both of them.
open_mask = filters.open_questions(data)
world_mask = filters.world_countries(data)
train = data[open_mask & world_mask]

In [17]:
def intervals(r):
    """Return the training-set sizes to sample for a data set of size ``r``.

    The sizes are the squares of 5, 10, 15, ... for every multiple of 5 that
    is strictly below ``int(sqrt(r))``. The list is empty when
    ``int(sqrt(r))`` is 5 or less.
    """
    upper = int(np.sqrt(r))
    return [step ** 2 for step in range(5, upper, 5)]

In [18]:
# Learning curve for the Elo model: fit on progressively larger prefixes of
# the training split and record the RMSE reported for the training prefix and
# for the fixed validation split.
train_examples = []
train_errs, valid_errs = [], []

# NOTE(review): the split is taken from the unfiltered `data`, not from the
# filtered `train` frame built in the loading cell -- confirm this is intended.
train_set, valid_set = tools.split_data(data)
limit = len(train_set)

for i in intervals(limit):
    # The test object is constructed with the full data set, then its
    # train/test sets are overridden so that only the first `i` training rows
    # are used while the validation split stays the same on every iteration.
    elo_test = tests.PerformanceTest(models.EloModel(), data)
    elo_test.test_set = valid_set
    elo_test.train_set = train_set[:i]
    elo_test.run()

    # Both errors are read from the results produced by run(); nothing in
    # this cell depends on objects defined outside the visible notebook.
    train_err = elo_test.results['train'].rmse
    valid_err = elo_test.results['test'].rmse

    train_errs.append(train_err)
    valid_errs.append(valid_err)
    train_examples.append(i)

    tools.echo('examples: {}, train RMSE: {}, cross RMSE: {}'.format(i, train_err, valid_err))


examples: 13225, train RMSE: 0.373162423621, cross RMSE: 0.424687572001

In [19]:
# Plot both error curves against the number of training examples on a
# logarithmic x axis.
x = train_examples

# (values, line format, legend label) for each curve, drawn in this order.
curves = (
    (train_errs, 'g-', 'training set'),
    (valid_errs, 'r-', 'cross validation set'),
)
for series, style, label in curves:
    plt.plot(x, series, style, label=label)

plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Number of training examples.')
# Kept as the final expression so the cell's displayed output is unchanged.
plt.ylabel('RMSE')


Out[19]:
<matplotlib.text.Text at 0x7fe7eefeb350>

In [ ]: