Incremental Learning with scikit-learn


In [1]:
from operator import or_

import numpy as np
import pandas as pd
from bson import BSON
from pymongo import cursor
from skll.metrics import kappa
from scipy.stats import pearsonr
from sklearn.cluster import MiniBatchKMeans
from sklearn.grid_search import ParameterGrid
from sklearn.naive_bayes import (BernoulliNB,
                                 MultinomialNB)
from sklearn.metrics import (precision_score,
                             f1_score,
                             accuracy_score,
                             confusion_matrix,
                             average_precision_score)
from sklearn.linear_model import (Perceptron,
                                  PassiveAggressiveRegressor)
from sklearn.feature_extraction import DictVectorizer

from src.features import *
from src.datasets import *
from src.mongodb import *

Data


In [2]:
# Running MongoDB on my own personal server (tunneled to localhost:37017
# in this case)
# Example: ssh -N -f -L localhost:37017:localhost:2700 mulhod@pool-108-24-47-200.cmdnnj.fios.verizon.net
host = 'localhost'
port = 37017
db = connect_to_db(host=host, port=port)
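
Note: connect_to_db comes from src.mongodb (imported above). A minimal
sketch of what such a helper might look like, assuming it wraps pymongo's
MongoClient and returns the reviews collection (the database and collection
names below are hypothetical):

from pymongo import MongoClient

def connect_to_db(host='localhost', port=27017):
    # Hypothetical sketch: open a client and return the reviews collection
    client = MongoClient(host, port)
    return client['reviews_project']['reviews']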

In [3]:
# Number of training/test reviews across all games
db.count()


Out[3]:
54051
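
Note: since db is a pymongo collection here, counts can also be restricted
with a filter document, e.g. to just the training partition (a sketch using
pymongo's count(), which accepts an optional filter in this version):

db.count({'partition': 'training'})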

In [4]:
# List games that the database contains data for
! ls ../data/*jsonlines | awk -F/ '{print $NF}'


Arma_3.jsonlines
Counter_Strike_Global_Offensive.jsonlines
Counter_Strike.jsonlines
Dota_2.jsonlines
Football_Manager_2015.jsonlines
Garrys_Mod.jsonlines
Grand_Theft_Auto_V.jsonlines
sample.jsonlines
Sid_Meiers_Civilization_5.jsonlines
Team_Fortress_2.jsonlines
The_Elder_Scrolls_V.jsonlines
Warframe.jsonlines

In [5]:
# Let's get a sense for the kind of data that is contained in each document
# (not including the NLP features, which have to be decoded, anyway)
db.find_one({}, {'nlp_features': 0})


Out[5]:
{'_id': ObjectId('560394d3cbb14611d0957f1c'),
 'achievement_progress': {'num_achievements_attained': 7,
  'num_achievements_percentage': 0.16279069767441862,
  'num_achievements_possible': 43},
 'appid': '107410',
 'bin_factor': 2.0,
 'bin_ranges': [[0.0, 338.1], [338.2, 1014.4], [1014.5, 2367.0]],
 'binarized': True,
 'date_posted': 'Dec 15, 2013, 7:32PM',
 'date_updated': None,
 'found_helpful_percentage': 0.5,
 'friend_player_level': 7,
 'game': 'Arma_3',
 'id_string': '560394d3cbb14611d0957f1c',
 'nbins': 3,
 'num_badges': 5,
 'num_comments': 1,
 'num_found_funny': 0,
 'num_found_helpful': 2,
 'num_found_unhelpful': 2,
 'num_friends': 35,
 'num_games_owned': 75,
 'num_groups': 7,
 'num_guides': 0,
 'num_reviews': 1,
 'num_screenshots': 789,
 'num_voted_helpfulness': 4,
 'num_workshop_items': 1,
 'orig_url': 'http://steamcommunity.com/app/107410/homecontent/?userreviewsoffset=5150&p=1&itemspage=516&screenshotspage=516&videospage=516&artpage=516&allguidepage=516&webguidepage=516&integratedguidepage=516&discussionspage=516&appid=107410&appHubSubSection=10&appHubSubSection=10&l=english&browsefilter=toprated&filterLanguage=default&searchText=&forceanon=1',
 'partition': 'training',
 'profile_url': 'http://steamcommunity.com/id/EthanTheFinn',
 'rating': 'Recommended',
 'review': '10 - Graphics 9.5 - Sound 10 - Gameplay 9.7 - Voice Acting 10 - Potential (Modding, and custom mission scenarios) Overal this is a great game!',
 'review_url': 'http://steamcommunity.com/id/EthanTheFinn/recommended/107410/',
 'steam_id_number': 'EthanTheFinn',
 'total_game_hours': 602.3,
 'total_game_hours_bin': 2,
 'total_game_hours_last_two_weeks': 0.0,
 'username': 'Ethan'}

In [6]:
# Review attributes
print('\n'.join(db.find_one({}, {'nlp_features': 0}).keys()))


date_posted
username
num_groups
found_helpful_percentage
num_workshop_items
total_game_hours_last_two_weeks
rating
partition
friend_player_level
achievement_progress
num_guides
num_games_owned
date_updated
review
total_game_hours
nbins
orig_url
num_comments
game
num_friends
num_voted_helpfulness
num_screenshots
num_badges
appid
binarized
review_url
bin_factor
_id
num_reviews
total_game_hours_bin
profile_url
steam_id_number
bin_ranges
id_string
num_found_funny
num_found_unhelpful
num_found_helpful

In [7]:
print('\n'.join(db.find_one({}, {'nlp_features': 0})['achievement_progress'].keys()))


num_achievements_percentage
num_achievements_possible
num_achievements_attained

In [8]:
# Let's also take a look at the NLP features that have been extracted
# from the review and stored in the database
nlp_features = (BSON.decode(db
                            .find_one({}, {'nlp_features': 1, '_id': 0})
                            .get('nlp_features')))
pd.DataFrame([dict(feature=feature, value=value) for feature, value
              in list(nlp_features.items())[:400]])


Out[8]:
              feature  value
0                  at      1
1                 Mod      1
2                10 -      1
3               ing 1      1
4                  er      1
5        cluster27626      1
6                 5 -      1
7                raph      1
8                ce A      1
9                miss      1
10                aph      1
11              9.7 -      1
12              Poten      1
13                 So      1
14               miss      1
15              on sc      1
16                 10      1
17         game:PRD:a      1
18                 Ov      1
19              - Gra      1
20                ari      1
21               Soun      1
22              s 9.5      1
23          cluster20      1
24                 on      1
25                  d      1
26              ice A      1
27              al th      1
28              (Modd      1
29               n sc      1
..                ...    ...
370                th      1
371              s) O      1
372              issi      1
373           modding      1
374             eplay      1
375                Gr      1
376               9.7      1
377              - Vo      1
378       cluster6634      1
379               sce      1
380               Ove      1
381              eral      1
382               Act      1
383              .7 -      1
384               9.7      1
385   scenario:VMOD:(      1
386               rap      1
387               Gam      1
388              cust      1
389                gr      1
390               Gra      1
391   overal:SUB:this      1
392       9.7:VMOD:10      1
393              tial      1
394             entia      1
395             and c      1
396                la      1
397                cs      1
398         ( modding      1
399               sce      1

[400 rows x 2 columns]
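
Note: the nlp_features field is stored as a BSON-encoded binary blob, which
is why it is decoded with BSON.decode above. A sketch of the round trip with
pymongo's bson module (the feature dict below is just a fragment of the
sample review's features):

blob = BSON.encode({'at': 1, 'Mod': 1, '10 -': 1})
BSON.decode(blob)  # -> {'at': 1, 'Mod': 1, '10 -': 1}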

Incremental Learning Experiment


In [11]:
# First let's import some code/variables from src.learn (part of this
# package), which will help keep this notebook clean and
# straightforward
from src.learn import *

In [12]:
# We will use a set of 2 learning algorithms (Perceptron and
# PassiveAggressiveRegressor) with reduced parameter grids
learners = [Perceptron, PassiveAggressiveRegressor]
_DEFAULT_PARAM_GRIDS = {Perceptron: {'alpha': [0.0001, 0.1],
                                     'random_state': [seed]},
                        PassiveAggressiveRegressor:
                            {'C': [0.01, 10.0],
                             'random_state': [seed]}}

In [10]:
# We will use Arma 3 as the game to train/evaluate a model for
game = 'Arma_3'
# We will run 5 rounds of learning with 50 new training samples being
# used in each round.
rounds = 5
n_training_samples = 50
# The model will be tested against the same set of held-out test reviews
# each round (capped at 500 below; the confusion matrices later show that
# 50 test reviews are available for Arma 3).
n_test_samples = 500
# Each unique combination of values in each learner's parameter grid will
# be evaluated (in this case the only parameter that has multiple values
# is 'alpha' for Perceptron and 'C' for PassiveAggressiveRegressor,
# so only 4 experiments will be run in total, but each one will have
# its own 5 rounds)
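
Note: the expansion of the grids into those 4 experiments can be checked
directly with ParameterGrid (imported at the top; it lives in
sklearn.model_selection in newer scikit-learn versions):

sum(len(ParameterGrid(grid)) for grid in _DEFAULT_PARAM_GRIDS.values())
# -> 4: two 'alpha' settings for Perceptron plus two 'C' settings for
#    PassiveAggressiveRegressor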

In [11]:
# Besides the NLP features, we will not use any of the review attributes
# in the model
non_nlp_features = set()
# The attribute that we will be predicting on is the number of hours the
# reviewer played (or, more specifically, the "bin" into which the number
# falls when the whole range is broken down into bins)
y_label = 'total_game_hours_bin'
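
Note: as a sketch of how total_game_hours maps onto total_game_hours_bin,
using the bin_ranges from the sample document above (the helper name is
hypothetical):

def hours_to_bin(hours, bin_ranges):
    # Return the 1-indexed bin whose range contains the given hours value
    for i, (low, high) in enumerate(bin_ranges, start=1):
        if low <= hours <= high:
            return i

bin_ranges = [[0.0, 338.1], [338.2, 1014.4], [1014.5, 2367.0]]
hours_to_bin(602.3, bin_ranges)  # -> 2, matching the sample review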

In [12]:
# The objective function we will use to rank the experiments will be
# quadratic weighted kappa
objective = 'qwk'
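
Note: quadratic weighted kappa penalizes disagreements by the squared
distance between the labels, so near-misses cost less than distant ones.
With skll's kappa (imported at the top), it can be computed as, e.g.:

kappa([1, 2, 3, 2], [1, 2, 2, 2], weights='quadratic')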

In [13]:
# Finally, we will also evaluate the majority baseline model to provide a
# point of comparison for how well the learned models really perform

In [14]:
# Suppress warnings since running IncrementalLearning may emit many
# "UndefinedMetricWarning"s
import warnings
warnings.filterwarnings("ignore")

In [15]:
Arma_3_inc_learning = \
    IncrementalLearning(db,
                        game,
                        learners,
                        [_DEFAULT_PARAM_GRIDS[learner]
                         for learner in learners],
                        n_training_samples,
                        non_nlp_features,
                        y_label,
                        objective,
                        test_limit=n_test_samples,
                        rounds=rounds,
                        majority_baseline=True)


INFO:root:Incremental learning experiments initialized...
2015-10-28 01:10:54,896 - root - INFO - Incremental learning experiments initialized...
INFO:root:Round 1...
2015-10-28 01:10:56,228 - root - INFO - Round 1...
INFO:root:Round 2...
2015-10-28 01:11:02,069 - root - INFO - Round 2...
INFO:root:Round 3...
2015-10-28 01:11:12,081 - root - INFO - Round 3...
INFO:root:Round 4...
2015-10-28 01:11:22,085 - root - INFO - Round 4...
INFO:root:Round 5...
2015-10-28 01:11:45,929 - root - INFO - Round 5...
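
Note: IncrementalLearning is presumably built around scikit-learn's
partial_fit interface, which both Perceptron and PassiveAggressiveRegressor
support. A minimal sketch of one experiment's loop, under that assumption
(FeatureHasher stands in for vectorization since it needs no up-front fit;
get_next_training_batch, test_feature_dicts, and the dict layout are
hypothetical):

from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher()
learner = PassiveAggressiveRegressor(C=0.01, random_state=123456789)
X_test = hasher.transform(test_feature_dicts)  # fixed test set, reused each round
for round_ in range(1, rounds + 1):
    batch = get_next_training_batch(n_training_samples)  # hypothetical helper
    X = hasher.transform(sample['features'] for sample in batch)
    y = [sample['label'] for sample in batch]
    learner.partial_fit(X, y)                      # update the model in place
    predictions = learner.predict(X_test).round()  # rounded predictions, as in
                                                   # the confusion matrices below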

In [16]:
# Now, let's take a look at the results
# First, we'll see how the majority baseline model performs

# There is a lot of data collected for each model, so it will be necessary
# here to constrain the analysis
# Let's suppose that we only want to see the following attributes:
# accuracy, precision, Pearson's r, quadratic weighted kappa, and
# confusion matrices
Arma_3_inc_learning.majority_baseline_stats.columns


Out[16]:
Index(['accuracy', 'confusion_matrix', 'f1_macro', 'f1_weighted', 'game',
       'learner', 'lwk', 'lwk_off_by_one', 'majority_label', 'pearson_r',
       'precision_macro', 'precision_weighted', 'prediction_label',
       'printable_confusion_matrix', 'qwk', 'qwk_off_by_one', 'significance',
       'uwk', 'uwk_off_by_one'],
      dtype='object')
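
Note: a majority baseline simply predicts the most frequent training label
for every test instance; a minimal sketch (y_train and y_test are
hypothetical names):

from collections import Counter

majority_label = Counter(y_train).most_common(1)[0][0]
baseline_predictions = [majority_label] * len(y_test)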

In [28]:
Arma_3_inc_learning.majority_baseline_stats[['accuracy',
                                             'precision_weighted',
                                             'pearson_r',
                                             'qwk']]


Out[28]:
   accuracy  precision_weighted  pearson_r  qwk
0       0.7                0.49        NaN    0

In [18]:
# As it turns out, quadratic weighted kappa and Pearson's r are undefined
# in this case because the majority baseline predictions are 100% one
# label, i.e., they're all the value that occurred most frequently in the
# training data
# Accuracy and precision, however, are still meaningful
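
Note: with constant predictions, the predictions' standard deviation is
zero, so the denominator of Pearson's r is zero and pearsonr (imported at
the top) yields nan:

pearsonr([1, 2, 1, 3], [1, 1, 1, 1])  # correlation undefined (nan), hence
                                      # the NaN above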

In [19]:
# Let's take a look at the confusion matrix
print(Arma_3_inc_learning
      .majority_baseline_stats
      .printable_confusion_matrix
      .iloc[0])


confusion_matrix (rounded predictions) (row=human, col=machine, labels=[1 2 3]):
	1	2	3
1	35	0	0
2	12	0	0
3	3	0	0


In [21]:
# As you can see, the baseline model predicted 1 in every single instance;
# since 35 of the 50 test reviews actually have label 1, that is exactly
# the 0.7 accuracy reported above

In [22]:
# Now, let's rank the experiments by how well the model did in the last
# round
ranked_experiments = \
    (Arma_3_inc_learning
     .rank_experiments_by_objective(ordering='objective_last_round'))

In [26]:
# Let's find out which experiment did best (this time we'll include a
# little more information than we did for the majority baseline model)
# Here are the possible columns:
print('Experimental attributes:\n\n\t{}'
      .format('\n\t'.join(ranked_experiments[0])))


Experimental attributes:

	accuracy
	confusion_matrix
	f1_macro
	f1_weighted
	game
	learner
	learning_round
	lwk
	lwk_off_by_one
	non-NLP features
	params
	pearson_r
	precision_macro
	precision_weighted
	prediction_label
	printable_confusion_matrix
	qwk
	qwk_off_by_one
	significance
	test_set_labels/test_set_predictions
	training_samples
	uwk
	uwk_off_by_one

In [30]:
ranked_experiments[0][['learner',
                       'learning_round',
                       'accuracy',
                       'precision_weighted',
                       'pearson_r',
                       'qwk']]


Out[30]:
                      learner  learning_round  accuracy  precision_weighted  pearson_r       qwk
0  PassiveAggressiveRegressor               1      0.76            0.766486   0.615896  0.598930
1  PassiveAggressiveRegressor               2      0.70            0.643683   0.492636  0.446086
2  PassiveAggressiveRegressor               3      0.74            0.640444   0.698262  0.488189
3  PassiveAggressiveRegressor               4      0.96            0.976000   0.957590  0.949495
4  PassiveAggressiveRegressor               5      0.90            0.926364   0.905036  0.882187

In [31]:
# So, it seems that the PassiveAggressiveRegressor takes the top
# prize
# Let's find out which set of parameters was used
ranked_experiments[0].params.iloc[0]


Out[31]:
{'C': 0.01,
 'class_weight': None,
 'epsilon': 0.1,
 'fit_intercept': True,
 'loss': 'epsilon_insensitive',
 'n_iter': 5,
 'random_state': 123456789,
 'shuffle': True,
 'verbose': 0,
 'warm_start': False}

In [32]:
# When 'C' is set to 0.01 (and everything else is default), this learning
# algorithm seems to do best (in this one case, at least)

In [33]:
# Furthermore, performance generally increases from learning round #1
# through learning round #5, despite a dip in rounds 2 and 3

In [36]:
# Let's see the confusion matrices and how they change over time
for i in range(5):
    print('Round #{}\n{}\n\n'.format(i + 1,
                                     ranked_experiments[0]
                                     .printable_confusion_matrix
                                     .iloc[i]))


Round #1
confusion_matrix (rounded predictions) (row=human, col=machine, labels=[1 2 3]):
	1	2	3
1	31	4	0
2	6	6	0
3	0	2	1



Round #2
confusion_matrix (rounded predictions) (row=human, col=machine, labels=[1 2 3]):
	1	2	3
1	31	4	0
2	8	4	0
3	0	3	0



Round #3
confusion_matrix (rounded predictions) (row=human, col=machine, labels=[1 2 3]):
	1	2	3
1	35	0	0
2	10	2	0
3	0	3	0



Round #4
confusion_matrix (rounded predictions) (row=human, col=machine, labels=[1 2 3]):
	1	2	3
1	35	0	0
2	0	10	2
3	0	0	3



Round #5
confusion_matrix (rounded predictions) (row=human, col=machine, labels=[1 2 3]):
	1	2	3
1	33	2	0
2	0	9	3
3	0	0	3




In [37]:
# Little by little, one can see that the model begins to predict 2s
# and 3s
# Indeed, this is what separates the learned models from the majority
# baseline model

In [52]:
# Lastly, it should be mentioned that the best model does indeed do
# better than the majority baseline model. In terms of accuracy and
# precision, it does significantly better:
print('Precision:\n\n\tmajority baseline model: {}\n\t'
      'learning model:          {}'
      .format(Arma_3_inc_learning
              .majority_baseline_stats
              .precision_weighted
              .iloc[0],
              ranked_experiments[0]
              .precision_weighted
              .iloc[len(ranked_experiments[0]) - 1]))
print('\nAccuracy:\n\n\tmajority baseline model: {}\n\t'
      'learning model:          {}'
      .format(Arma_3_inc_learning
              .majority_baseline_stats
              .accuracy
              .iloc[0],
              ranked_experiments[0]
              .accuracy
              .iloc[len(ranked_experiments[0]) - 1]))


Precision:

	majority baseline model: 0.49
	learning model:          0.9263636363636364

Accuracy:

	majority baseline model: 0.7
	learning model:          0.9
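
Note: these figures line up with scikit-learn's metrics imported at the
top; given a round's stored labels and predictions (the
test_set_labels/test_set_predictions column above), they could be
recomputed as, e.g. (variable names hypothetical):

accuracy_score(test_set_labels, test_set_predictions)
precision_score(test_set_labels, test_set_predictions, average='weighted')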