Incremental Learning with scikit-learn


In [1]:
from operator import or_

import numpy as np
import pandas as pd
from bson import BSON
from pymongo import cursor
from skll.metrics import kappa
from scipy.stats import pearsonr
from sklearn.cluster import MiniBatchKMeans
from sklearn.grid_search import ParameterGrid
from sklearn.naive_bayes import (BernoulliNB,
                                 MultinomialNB)
from sklearn.metrics import (precision_score,
                             f1_score,
                             accuracy_score,
                             confusion_matrix,
                             average_precision_score)
from sklearn.linear_model import (Perceptron,
                                  PassiveAggressiveRegressor)
from sklearn.feature_extraction import DictVectorizer

from src.features import *
from src.datasets import *
from src.mongodb import *

Data


In [2]:
# Running MongoDB on my own personal server (tunneled to localhost:37017
# in this case)
# Example: ssh -N -f -L localhost:37017:localhost:2700 mulhod@pool-108-24-47-200.cmdnnj.fios.verizon.net
host = 'localhost'
port = 37017
db = connect_to_db(host=host, port=port)
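
Note: connect_to_db comes from src.mongodb (imported above). A minimal
sketch of what such a helper might look like, assuming it wraps pymongo's
MongoClient and returns the reviews collection (the database and collection
names below are hypothetical):

from pymongo import MongoClient

def connect_to_db(host='localhost', port=27017):
    # Hypothetical sketch: open a client and return the reviews collection
    client = MongoClient(host, port)
    return client['reviews_project']['reviews']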

In [3]:
# Number of training/test reviews across all games
db.count()


Out[3]:
54051
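
Note: since db is a pymongo collection here, counts can also be restricted
with a filter document, e.g. to just the training partition (a sketch using
pymongo's count(), which accepts an optional filter in this version):

db.count({'partition': 'training'})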

In [4]:
# List games that the database contains data for
! ls ../data/*jsonlines | awk -F/ '{print $NF}'


Arma_3.jsonlines
Counter_Strike_Global_Offensive.jsonlines
Counter_Strike.jsonlines
Dota_2.jsonlines
Football_Manager_2015.jsonlines
Garrys_Mod.jsonlines
Grand_Theft_Auto_V.jsonlines
sample.jsonlines
Sid_Meiers_Civilization_5.jsonlines
Team_Fortress_2.jsonlines
The_Elder_Scrolls_V.jsonlines
Warframe.jsonlines

In [5]:
# Let's get a sense for the kind of data that is contained in each document
# (not including the NLP features, which have to be decoded, anyway)
db.find_one({}, {'nlp_features': 0})


Out[5]:
{'_id': ObjectId('560394d3cbb14611d0957f1c'),
 'achievement_progress': {'num_achievements_attained': 7,
  'num_achievements_percentage': 0.16279069767441862,
  'num_achievements_possible': 43},
 'appid': '107410',
 'bin_factor': 2.0,
 'bin_ranges': [[0.0, 338.1], [338.2, 1014.4], [1014.5, 2367.0]],
 'binarized': True,
 'date_posted': 'Dec 15, 2013, 7:32PM',
 'date_updated': None,
 'found_helpful_percentage': 0.5,
 'friend_player_level': 7,
 'game': 'Arma_3',
 'id_string': '560394d3cbb14611d0957f1c',
 'nbins': 3,
 'num_badges': 5,
 'num_comments': 1,
 'num_found_funny': 0,
 'num_found_helpful': 2,
 'num_found_unhelpful': 2,
 'num_friends': 35,
 'num_games_owned': 75,
 'num_groups': 7,
 'num_guides': 0,
 'num_reviews': 1,
 'num_screenshots': 789,
 'num_voted_helpfulness': 4,
 'num_workshop_items': 1,
 'orig_url': 'http://steamcommunity.com/app/107410/homecontent/?userreviewsoffset=5150&p=1&itemspage=516&screenshotspage=516&videospage=516&artpage=516&allguidepage=516&webguidepage=516&integratedguidepage=516&discussionspage=516&appid=107410&appHubSubSection=10&appHubSubSection=10&l=english&browsefilter=toprated&filterLanguage=default&searchText=&forceanon=1',
 'partition': 'training',
 'profile_url': 'http://steamcommunity.com/id/EthanTheFinn',
 'rating': 'Recommended',
 'review': '10 - Graphics 9.5 - Sound 10 - Gameplay 9.7 - Voice Acting 10 - Potential (Modding, and custom mission scenarios) Overal this is a great game!',
 'review_url': 'http://steamcommunity.com/id/EthanTheFinn/recommended/107410/',
 'steam_id_number': 'EthanTheFinn',
 'total_game_hours': 602.3,
 'total_game_hours_bin': 2,
 'total_game_hours_last_two_weeks': 0.0,
 'username': 'Ethan'}

In [6]:
# Review attributes
print('\n'.join(db.find_one({}, {'nlp_features': 0}).keys()))


date_posted
username
num_groups
found_helpful_percentage
num_workshop_items
total_game_hours_last_two_weeks
rating
partition
friend_player_level
achievement_progress
num_guides
num_games_owned
date_updated
review
total_game_hours
nbins
orig_url
num_comments
game
num_friends
num_voted_helpfulness
num_screenshots
num_badges
appid
binarized
review_url
bin_factor
_id
num_reviews
total_game_hours_bin
profile_url
steam_id_number
bin_ranges
id_string
num_found_funny
num_found_unhelpful
num_found_helpful

In [7]:
print('\n'.join(db.find_one({}, {'nlp_features': 0})['achievement_progress'].keys()))


num_achievements_percentage
num_achievements_possible
num_achievements_attained

In [8]:
# Let's also take a look at the NLP features that have been extracted
# from the review and stored in the database
nlp_features = (BSON.decode(db
                            .find_one({}, {'nlp_features': 1, '_id': 0})
                            .get('nlp_features')))
pd.DataFrame([dict(feature=feature, value=value) for feature, value
              in list(nlp_features.items())[:400]])


Out[8]:
              feature  value
0                  at      1
1                 Mod      1
2                10 -      1
3               ing 1      1
4                  er      1
5        cluster27626      1
6                 5 -      1
7                raph      1
8                ce A      1
9                miss      1
10                aph      1
11              9.7 -      1
12              Poten      1
13                 So      1
14               miss      1
15              on sc      1
16                 10      1
17         game:PRD:a      1
18                 Ov      1
19              - Gra      1
20                ari      1
21               Soun      1
22              s 9.5      1
23          cluster20      1
24                 on      1
25                  d      1
26              ice A      1
27              al th      1
28              (Modd      1
29               n sc      1
..                ...    ...
370                th      1
371              s) O      1
372              issi      1
373           modding      1
374             eplay      1
375                Gr      1
376               9.7      1
377              - Vo      1
378       cluster6634      1
379               sce      1
380               Ove      1
381              eral      1
382               Act      1
383              .7 -      1
384               9.7      1
385   scenario:VMOD:(      1
386               rap      1
387               Gam      1
388              cust      1
389                gr      1
390               Gra      1
391   overal:SUB:this      1
392       9.7:VMOD:10      1
393              tial      1
394             entia      1
395             and c      1
396                la      1
397                cs      1
398         ( modding      1
399               sce      1

[400 rows x 2 columns]
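
Note: the nlp_features field is stored as a BSON-encoded binary blob, which
is why it is decoded with BSON.decode above. A sketch of the round trip with
pymongo's bson module (the feature dict below is just a fragment of the
sample review's features):

blob = BSON.encode({'at': 1, 'Mod': 1, '10 -': 1})
BSON.decode(blob)  # -> {'at': 1, 'Mod': 1, '10 -': 1}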

Incremental Learning Experiment


In [11]:
# First let's import some code/variables from src.learn (part of this
# package), which will help keep this notebook clean and
# straightforward
from src.learn import *

In [12]:
# We will use a set of 2 learning algorithms (Perceptron and
# PassiveAggressiveRegressor) with reduced parameter grids
learners = [Perceptron, PassiveAggressiveRegressor]
_DEFAULT_PARAM_GRIDS = {Perceptron: {'alpha': [0.0001, 0.1],
                                     'random_state': [seed]},
                        PassiveAggressiveRegressor:
                            {'C': [0.01, 10.0],
                             'random_state': [seed]}}

In [10]:
# We will use Arma 3 as the game to train/evaluate a model for
game = 'Arma_3'
# We will run 5 rounds of learning with 50 new training samples being
# used in each round.
rounds = 5
n_training_samples = 50
# The model will be tested against the same set of held-out test reviews
# each round (capped at 500 below; the confusion matrices later show that
# 50 test reviews are available for Arma 3).
n_test_samples = 500
# Each unique combination of values in each learner's parameter grid will
# be evaluated (in this case the only parameter that has multiple values
# is 'alpha' for Perceptron and 'C' for PassiveAggressiveRegressor,
# so only 4 experiments will be run in total, but each one will have
# its own 5 rounds)
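
Note: the expansion of the grids into those 4 experiments can be checked
directly with ParameterGrid (imported at the top; it lives in
sklearn.model_selection in newer scikit-learn versions):

sum(len(ParameterGrid(grid)) for grid in _DEFAULT_PARAM_GRIDS.values())
# -> 4: two 'alpha' settings for Perceptron plus two 'C' settings for
#    PassiveAggressiveRegressor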

In [11]:
# Besides the NLP features, we will not use any of the review attributes
# in the model
non_nlp_features = set()
# The attribute that we will be predicting on is the number of hours the
# reviewer played (or, more specifically, the "bin" into which the number
# falls when the whole range is broken down into bins)
y_label = 'total_game_hours_bin'
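
Note: as a sketch of how total_game_hours maps onto total_game_hours_bin,
using the bin_ranges from the sample document above (the helper name is
hypothetical):

def hours_to_bin(hours, bin_ranges):
    # Return the 1-indexed bin whose range contains the given hours value
    for i, (low, high) in enumerate(bin_ranges, start=1):
        if low <= hours <= high:
            return i

bin_ranges = [[0.0, 338.1], [338.2, 1014.4], [1014.5, 2367.0]]
hours_to_bin(602.3, bin_ranges)  # -> 2, matching the sample review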

In [12]:
# The objective function we will use to rank the experiments will be
# quadratic weighted kappa
objective = 'qwk'
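
Note: quadratic weighted kappa penalizes disagreements by the squared
distance between the labels, so near-misses cost less than distant ones.
With skll's kappa (imported at the top), it can be computed as, e.g.:

kappa([1, 2, 3, 2], [1, 2, 2, 2], weights='quadratic')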

In [13]:
# Finally, we will also evaluate the majority baseline model to provide a
# point of comparison for how well the learned models really perform

In [14]:
# Suppress warnings since running IncrementalLearning may emit many
# "UndefinedMetricWarning"s
import warnings
warnings.filterwarnings("ignore")

In [15]:
Arma_3_inc_learning = \
    IncrementalLearning(db,
                        game,
                        learners,
                        [_DEFAULT_PARAM_GRIDS[learner]
                         for learner in learners],
                        n_training_samples,
                        non_nlp_features,
                        y_label,
                        objective,
                        test_limit=n_test_samples,
                        rounds=rounds,
                        majority_baseline=True)


INFO:root:Incremental learning experiments initialized...
2015-10-28 01:10:54,896 - root - INFO - Incremental learning experiments initialized...
INFO:root:Round 1...
2015-10-28 01:10:56,228 - root - INFO - Round 1...
INFO:root:Round 2...
2015-10-28 01:11:02,069 - root - INFO - Round 2...
INFO:root:Round 3...
2015-10-28 01:11:12,081 - root - INFO - Round 3...
INFO:root:Round 4...
2015-10-28 01:11:22,085 - root - INFO - Round 4...
INFO:root:Round 5...
2015-10-28 01:11:45,929 - root - INFO - Round 5...
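
Note: IncrementalLearning is presumably built around scikit-learn's
partial_fit interface, which both Perceptron and PassiveAggressiveRegressor
support. A minimal sketch of one experiment's loop, under that assumption
(FeatureHasher stands in for vectorization since it needs no up-front fit;
get_next_training_batch, test_feature_dicts, and the dict layout are
hypothetical):

from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher()
learner = PassiveAggressiveRegressor(C=0.01, random_state=123456789)
X_test = hasher.transform(test_feature_dicts)  # fixed test set, reused each round
for round_ in range(1, rounds + 1):
    batch = get_next_training_batch(n_training_samples)  # hypothetical helper
    X = hasher.transform(sample['features'] for sample in batch)
    y = [sample['label'] for sample in batch]
    learner.partial_fit(X, y)                      # update the model in place
    predictions = learner.predict(X_test).round()  # rounded predictions, as in
                                                   # the confusion matrices below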

In [16]:
# Now, let's take a look at the results
# First, we'll see how the majority baseline model performs

# There is a lot of data collected for each model, so it will be necessary
# here to constrain the analysis
# Let's suppose that we only want to see the following attributes:
# accuracy, precision, Pearson's r, quadratic weighted kappa, and
# confusion matrices
Arma_3_inc_learning.majority_baseline_stats.columns


Out[16]:
Index(['accuracy', 'confusion_matrix', 'f1_macro', 'f1_weighted', 'game',
       'learner', 'lwk', 'lwk_off_by_one', 'majority_label', 'pearson_r',
       'precision_macro', 'precision_weighted', 'prediction_label',
       'printable_confusion_matrix', 'qwk', 'qwk_off_by_one', 'significance',
       'uwk', 'uwk_off_by_one'],
      dtype='object')
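
Note: a majority baseline simply predicts the most frequent training label
for every test instance; a minimal sketch (y_train and y_test are
hypothetical names):

from collections import Counter

majority_label = Counter(y_train).most_common(1)[0][0]
baseline_predictions = [majority_label] * len(y_test)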

In [28]:
Arma_3_inc_learning.majority_baseline_stats[['accuracy',
                                             'precision_weighted',
                                             'pearson_r',
                                             'qwk']]


Out[28]:
   accuracy  precision_weighted  pearson_r  qwk
0       0.7                0.49        NaN    0

In [18]:
# As it turns out, quadratic weighted kappa and Pearson's r are undefined
# in this case because the majority baseline predictions are 100% one
# label, i.e., they're all the value that occurred most frequently in the
# training data
# Accuracy and precision, however, are still meaningful
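
Note: with constant predictions, the predictions' standard deviation is
zero, so the denominator of Pearson's r is zero and pearsonr (imported at
the top) yields nan:

pearsonr([1, 2, 1, 3], [1, 1, 1, 1])  # correlation undefined (nan), hence
                                      # the NaN above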

In [19]:
# Let's take a look at the confusion matrix
print(Arma_3_inc_learning
      .majority_baseline_stats
      .printable_confusion_matrix
      .iloc[0])


confusion_matrix (rounded predictions) (row=human, col=machine, labels=[1 2 3]):
	1	2	3
1	35	0	0
2	12	0	0
3	3	0	0


In [21]:
# As you can see, the baseline model predicted 1 in every single instance;
# since 35 of the 50 test reviews actually have label 1, that is exactly
# the 0.7 accuracy reported above

In [22]:
# Now, let's rank the experiments by how well the model did in the last
# round
ranked_experiments = \
    (Arma_3_inc_learning
     .rank_experiments_by_objective(ordering='objective_last_round'))

In [26]:
# Let's find out which experiment did best (this time we'll include a
# little more information than we did for the majority baseline model)
# Here are the possible columns:
print('Experimental attributes:\n\n\t{}'
      .format('\n\t'.join(ranked_experiments[0])))


Experimental attributes:

	accuracy
	confusion_matrix
	f1_macro
	f1_weighted
	game
	learner
	learning_round
	lwk
	lwk_off_by_one
	non-NLP features
	params
	pearson_r
	precision_macro
	precision_weighted
	prediction_label
	printable_confusion_matrix
	qwk
	qwk_off_by_one
	significance
	test_set_labels/test_set_predictions
	training_samples
	uwk
	uwk_off_by_one

In [30]:
ranked_experiments[0][['learner',
                       'learning_round',
                       'accuracy',
                       'precision_weighted',
                       'pearson_r',
                       'qwk']]


Out[30]:
                      learner  learning_round  accuracy  precision_weighted  pearson_r       qwk
0  PassiveAggressiveRegressor               1      0.76            0.766486   0.615896  0.598930
1  PassiveAggressiveRegressor               2      0.70            0.643683   0.492636  0.446086
2  PassiveAggressiveRegressor               3      0.74            0.640444   0.698262  0.488189
3  PassiveAggressiveRegressor               4      0.96            0.976000   0.957590  0.949495
4  PassiveAggressiveRegressor               5      0.90            0.926364   0.905036  0.882187

In [31]:
# So, it seems that the PassiveAggressiveRegressor takes the top
# prize
# Let's find out which set of parameters was used
ranked_experiments[0].params.iloc[0]


Out[31]:
{'C': 0.01,
 'class_weight': None,
 'epsilon': 0.1,
 'fit_intercept': True,
 'loss': 'epsilon_insensitive',
 'n_iter': 5,
 'random_state': 123456789,
 'shuffle': True,
 'verbose': 0,
 'warm_start': False}

In [32]:
# When 'C' is set to 0.01 (and everything else is default), this learning
# algorithm seems to do best (in this one case, at least)

In [33]:
# Furthermore, performance generally increases from learning round #1
# through learning round #5, despite a dip in rounds 2 and 3

In [36]:
# Let's see the confusion matrices and how they change over time
for i in range(5):
    print('Round #{}\n{}\n\n'.format(i + 1,
                                     ranked_experiments[0]
                                     .printable_confusion_matrix
                                     .iloc[i]))


Round #1
confusion_matrix (rounded predictions) (row=human, col=machine, labels=[1 2 3]):
	1	2	3
1	31	4	0
2	6	6	0
3	0	2	1



Round #2
confusion_matrix (rounded predictions) (row=human, col=machine, labels=[1 2 3]):
	1	2	3
1	31	4	0
2	8	4	0
3	0	3	0



Round #3
confusion_matrix (rounded predictions) (row=human, col=machine, labels=[1 2 3]):
	1	2	3
1	35	0	0
2	10	2	0
3	0	3	0



Round #4
confusion_matrix (rounded predictions) (row=human, col=machine, labels=[1 2 3]):
	1	2	3
1	35	0	0
2	0	10	2
3	0	0	3



Round #5
confusion_matrix (rounded predictions) (row=human, col=machine, labels=[1 2 3]):
	1	2	3
1	33	2	0
2	0	9	3
3	0	0	3




In [37]:
# Little by little, one can see that the model begins to predict 2s
# and 3s
# Indeed, this is what separates the learned models from the majority
# baseline model

In [52]:
# Lastly, it should be mentioned that the best model does indeed do
# better than the majority baseline model. In terms of accuracy and
# precision, it does significantly better:
print('Precision:\n\n\tmajority baseline model: {}\n\t'
      'learning model:          {}'
      .format(Arma_3_inc_learning
              .majority_baseline_stats
              .precision_weighted
              .iloc[0],
              ranked_experiments[0]
              .precision_weighted
              .iloc[len(ranked_experiments[0]) - 1]))
print('\nAccuracy:\n\n\tmajority baseline model: {}\n\t'
      'learning model:          {}'
      .format(Arma_3_inc_learning
              .majority_baseline_stats
              .accuracy
              .iloc[0],
              ranked_experiments[0]
              .accuracy
              .iloc[len(ranked_experiments[0]) - 1]))


Precision:

	majority baseline model: 0.49
	learning model:          0.9263636363636364

Accuracy:

	majority baseline model: 0.7
	learning model:          0.9
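
Note: these figures line up with scikit-learn's metrics imported at the
top; given a round's stored labels and predictions (the
test_set_labels/test_set_predictions column above), they could be
recomputed as, e.g. (variable names hypothetical):

accuracy_score(test_set_labels, test_set_predictions)
precision_score(test_set_labels, test_set_predictions, average='weighted')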