In [1]:
%load_ext watermark

In [2]:
%watermark -d -v -a 'Sebastian Raschka' -p scikit-learn,nltk,numpy


Sebastian Raschka 13/12/2014 

CPython 3.4.2
IPython 2.3.0

scikit-learn 0.15.2
nltk 3.0.0
numpy 1.9.1

[More information](https://github.com/rasbt/watermark) about the `watermark` magic command extension.



Lyrics Mood Classification - Training





Preprocessing



Reading the dataset


In [3]:
import pandas as pd

df_train = pd.read_csv('../../dataset/training/train_lyrics_1000.csv')
df_test = pd.read_csv('../../dataset/validation/valid_lyrics_200.csv')

X_train = df_train['lyrics'].values 
y_train = df_train['mood'].values

X_test = df_test['lyrics'].values 
y_test = df_test['mood'].values

df_train.head()


Out[3]:
|   | file | artist | title | lyrics | mood | year |
|---|------|--------|-------|--------|------|------|
| 0 | TRAAAAW128F429D538.h5 | Casual | I Didn't Mean To | Verse One:\n\nAlright I might\nHave had a litt... | sad | 1994 |
| 1 | TRAAAEF128F4273421.h5 | Adam Ant | Something Girls | Adam Ant/Marco Pirroni\nEvery girl is a someth... | happy | 1982 |
| 2 | TRAAAFD128F92F423A.h5 | Gob | Face the Ashes | I've just erased it's been a while, I've got a... | sad | 2007 |
| 3 | TRAABJV128F1460C49.h5 | Lionel Richie | Tonight Will Be Alright | Little darling \nWhere you've been so long \nI... | happy | 1986 |
| 4 | TRAABLR128F423B7E3.h5 | Blue Rodeo | Floating | Lead Vocal by Greg\n\nWell, these late night c... | sad | 1987 |



Label Encoder


In [4]:
import pickle
import numpy as np

pickle_in = open('./label_encoder.p', 'rb')
le = pickle.load(pickle_in)
pickle_in.close()

print('before: %s ...' %y_train[:5])

y_train = le.transform(y_train)
y_test = le.transform(y_test)

print('after: %s ...' %y_train[:5])


before: ['sad' 'happy' 'sad' 'happy' 'sad'] ...
after: [1 0 1 0 1] ...
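The pickled encoder behaves like a scikit-learn `LabelEncoder` fit on the two mood labels, with the classes sorted alphabetically (happy=0, sad=1). A minimal sketch of how an equivalent encoder could have been created and pickled (an assumption; the actual encoder was built elsewhere):

from sklearn.preprocessing import LabelEncoder
import pickle

le = LabelEncoder()
le.fit(['happy', 'sad'])   # classes_ sorted alphabetically: happy -> 0, sad -> 1
# pickle.dump(le, open('./label_encoder.p', 'wb'))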



Stop word list


In [5]:
with open('./stopwords_eng.txt') as sw:
    stopwords = sw.read().split()
stopwords[-10:]


Out[5]:
['too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']
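The last ten entries match the tail of NLTK's English stop word list, so the file presumably mirrors it. Assuming that origin, the list can be compared against NLTK directly:

from nltk.corpus import stopwords as nltk_sw
# import nltk; nltk.download('stopwords')  # one-time download

print(nltk_sw.words('english')[-10:])  # compare with the file's tail above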



Feature extraction: Word counts and Vectorizers

Initial vocabularies


In [6]:
# Without stemming (raw word tokens)
from sklearn.feature_extraction.text import CountVectorizer

import re

vec_plain = CountVectorizer(analyzer='word',
                      decode_error='replace',
                      tokenizer=lambda text: text.split(),
                      preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                      stop_words=stopwords,)


vec_plain.fit(X_train)

print('Vocabulary size: %s words' % len(vec_plain.vocabulary_))

vocab_1 = vec_plain.get_feature_names()
print(vocab_1[-10:])


Vocabulary size: 11378 words
['zipped', 'zippin', 'zipping', 'zippo', 'zodiac', 'zoli', 'zombie', 'zone', 'zonin', 'zoo']
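To make the preprocessing and tokenization steps concrete, here is what they do to a made-up lyric line (the sample string is invented for illustration; stop words such as 'i' and 'you' are removed later by the vectorizer):

sample = "I've been waiting 4 you, baby!"
cleaned = re.sub('[^a-zA-Z]', ' ', sample.lower())  # strip digits/punctuation, lowercase
print(cleaned.split())  # ['i', 've', 'been', 'waiting', 'you', 'baby']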

In [7]:
# With Porter stemming
from nltk.stem import PorterStemmer

porter = PorterStemmer()

vec_porter = CountVectorizer(analyzer='word',
                      decode_error='replace',
                      tokenizer=lambda text: [porter.stem(word) for word in text.split()],
                      preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()), 
                      stop_words=stopwords,) 


vec_porter.fit(X_train)

print('Vocabulary size: %s words' % len(vec_porter.vocabulary_))

vocab_2 = vec_porter.get_feature_names()
print(vocab_2[-10:])


Vocabulary size: 8551 words
['zip', 'ziplock', 'zippin', 'zippo', 'zodiac', 'zoli', 'zombi', 'zone', 'zonin', 'zoo']
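The stemmer's effect shows in the vocabulary tail above ('zombie' became 'zombi', and 'zipped'/'zipping' collapsed into 'zip'); it can also be inspected word by word:

for word in ['zombie', 'zipping', 'erased']:
    print('%s -> %s' % (word, porter.stem(word)))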



Whitelists of positive and negative words


In [8]:
with open('./whitelist_dicts/negative-words.txt', 'r') as w:
    negative_words = w.read().split()
print('number of negative words: %s' % len(negative_words))
negative_words[-5:]


number of negative words: 4783
Out[8]:
['zaps', 'zealot', 'zealous', 'zealously', 'zombie']

In [9]:
with open('./whitelist_dicts/positive-words.txt', 'r') as w:
    positive_words = w.read().split()
print('number of positive words: %s' % len(positive_words))
positive_words[-5:]


number of positive words: 2006
Out[9]:
['youthful', 'zeal', 'zenith', 'zest', 'zippy']

In [10]:
negative_words = set(negative_words)
positive_words = set(positive_words)
semantic_words = negative_words.union(positive_words)

#pickle.dump(semantic_words, open('./whitelist_dicts/semantic_words.p', 'wb'))
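A couple of quick sanity checks on the combined whitelist (the membership checks follow from the outputs above; the union can be smaller than 4783 + 2006 if the two lists overlap):

print(len(semantic_words))         # at most 4783 + 2006
print('zombie' in negative_words)  # True, see Out[8]
print('zeal' in positive_words)    # True, see Out[9]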




In [11]:
# With whitelist only (no stemming)
from sklearn.feature_extraction.text import CountVectorizer
import re

vec_wl = CountVectorizer(analyzer='word',
                      decode_error='replace',
                      tokenizer=lambda text: [word for word in text.split() if word in semantic_words],
                      preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                      stop_words=stopwords,)


vec_wl.fit(X_train)

print('Vocabulary size: %s words' % len(vec_wl.vocabulary_))

vocab_3 = vec_wl.get_feature_names()
print(vocab_3[-10:])


Vocabulary size: 1666 words
['wrath', 'wreck', 'wrest', 'wrestle', 'wretched', 'wrong', 'yawn', 'yay', 'zest', 'zombie']

In [12]:
# With whitelist and Porter stemming
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
import re

porter = PorterStemmer()

vec_porter_wl = CountVectorizer(analyzer='word',
                      decode_error='replace',
                      tokenizer=lambda text: [porter.stem(word) for word in text.split() if word in semantic_words],
                      preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                      stop_words=stopwords,)


vec_porter_wl.fit(X_train)

print('Vocabulary size: %s words' % len(vec_porter_wl.vocabulary_))

vocab_4 = vec_porter_wl.get_feature_names()
print(vocab_4[-10:])


Vocabulary size: 1349 words
['wrath', 'wreck', 'wrest', 'wrestl', 'wretch', 'wrong', 'yawn', 'yay', 'zest', 'zombi']



Summary vocabulary sizes


In [14]:
print('All words: %s ' % len(vocab_1))
print('Porter: %s ' % len(vocab_2))
print('Whitelist: %s ' % len(vocab_3))
print('Porter + Whitelist: %s ' % len(vocab_4))


All words: 11378 
Porter: 8551 
Whitelist: 1666 
Porter + Whitelist: 1349 





Model Selection



Setup


In [13]:
# Requirements

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from nltk.stem import PorterStemmer
import re
import pickle
from sklearn import metrics

porter = PorterStemmer()

semantic_words = pickle.load(open('./whitelist_dicts/semantic_words.p', 'rb'))

with open('./stopwords_eng.txt') as sw:
    stop_words = sw.read().split()
    
f1_scorer = metrics.make_scorer(metrics.f1_score, greater_is_better=True, pos_label=0)
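Since the moods are encoded as happy=0 and sad=1, `pos_label=0` makes 'happy' the positive class of the F1 scorer, where F1 = 2 * (precision * recall) / (precision + recall). A tiny hand-made example (hypothetical labels, not from the dataset):

from sklearn import metrics

y_true = [0, 0, 1, 1]  # two happy, two sad
y_pred = [0, 1, 1, 1]  # one happy recovered

# precision for class 0: 1/1 = 1.0; recall for class 0: 1/2 = 0.5
print(metrics.f1_score(y_true, y_pred, pos_label=0))  # 2*(1.0*0.5)/(1.0+0.5) = 0.667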




In [14]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer

porter = PorterStemmer()
snowball = EnglishStemmer()

# raw words
tokenizer = lambda text: text.split()

# words after Porter stemming 
tokenizer_porter = lambda text: [porter.stem(word) for word in text.split()]

# Words after Snowball stemming
tokenizer_snowball = lambda text: [snowball.stem(word) for word in text.split()]

# Only words that are in a list of 'positive' or 'negative' words ('whitelist')
# http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon
tokenizer_whitelist = lambda text: [word for word in text.split() if word in semantic_words]

# Porter-stemmed words in whitelist
tokenizer_porter_wl = lambda text: [porter.stem(word) for word in text.split() if word in semantic_words]

# Snowball-stemmed words in whitelist
tokenizer_snowball_wl = lambda text: [snowball.stem(word) for word in text.split() if word in semantic_words]
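A quick way to see where the two stemmers disagree; the printed stems are whatever NLTK's implementations return for these sample words ('fairly' is a classic case where Porter and Snowball differ):

for word in ['running', 'fairly', 'generously']:
    print('%s -> porter: %s, snowball: %s' % (word, porter.stem(word), snowball.stem(word)))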

In [86]:
pipeline_1 = Pipeline([
    ('vect', CountVectorizer(binary=False,
                             stop_words=stop_words,
                             ngram_range=(1,1),
                             preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                             )),
    ('clf', MultinomialNB(fit_prior=False, 
                          alpha=1.0))
])

parameters_1 = dict(
    vect__tokenizer=[tokenizer, tokenizer_porter, tokenizer_whitelist, tokenizer_porter_wl],
)

grid_search_1 = GridSearchCV(pipeline_1, 
                           parameters_1, 
                           n_jobs=1, 
                           verbose=1,
                           scoring=f1_scorer,
                           cv=10
                )


print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_1.steps])
grid_search_1.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search_1.best_score_)
print("Best parameters set:")
best_parameters_1 = grid_search_1.best_estimator_.get_params()
for param_name in sorted(parameters_1.keys()):
    print("\t%s: %r" % (param_name, best_parameters_1[param_name]))


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    1.2s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.2min finished
Performing grid search...
pipeline: ['vect', 'clf']
Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best score: 0.632
Best parameters set:
	vect__tokenizer: <function <lambda> at 0x10a4d2f28>

In [87]:
pipeline_2 = Pipeline([
    ('vect', TfidfVectorizer(binary=False,
                             stop_words=stop_words,
                             ngram_range=(1,1),
                             preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                             )),
    ('clf', MultinomialNB(fit_prior=False, 
                          alpha=1.0))
])

parameters_2 = dict(
    vect__tokenizer=[tokenizer, tokenizer_porter, tokenizer_whitelist, tokenizer_porter_wl],
)

grid_search_2 = GridSearchCV(pipeline_2, 
                           parameters_2, 
                           n_jobs=1, 
                           verbose=1,
                           scoring=f1_scorer,
                           cv=10
                )


print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_2.steps])
grid_search_2.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search_2.best_score_)
print("Best parameters set:")
best_parameters_2 = grid_search_2.best_estimator_.get_params()
for param_name in sorted(parameters_2.keys()):
    print("\t%s: %r" % (param_name, best_parameters_1[param_name]))


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    1.0s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.0min finished
Performing grid search...
pipeline: ['vect', 'clf']
Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best score: 0.606
Best parameters set:
	vect__tokenizer: <function <lambda> at 0x10a4d2f28>



ROC Curves


In [15]:
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import StratifiedKFold
from scipy import interp

sns.set()
sns.set_style("whitegrid")

preprocessor = lambda text: re.sub('[^a-zA-Z]', ' ', text.lower())

# Twelve vectorizers: {CountVectorizer, TfidfVectorizer} x the six tokenizers
# defined above, all with the same settings
vectorizers = []
for Vectorizer in (CountVectorizer, TfidfVectorizer):
    for tok in (tokenizer, tokenizer_porter, tokenizer_snowball,
                tokenizer_whitelist, tokenizer_porter_wl, tokenizer_snowball_wl):
        vectorizers.append(Vectorizer(binary=False,
                                      stop_words=stop_words,
                                      ngram_range=(1,1),
                                      preprocessor=preprocessor,
                                      tokenizer=tok))

pipelines = []
for v in vectorizers:
    pipelines.append(Pipeline([('vect', v),
                               ('clf', MultinomialNB(fit_prior=False, alpha=1.0))]))
    
for v in vectorizers[:6]:
    pipelines.append(Pipeline([('vect', v),
                               ('clf', BernoulliNB(fit_prior=False, alpha=1.0))]))

In [17]:
colors = ['blue', 'red', 'black', 'green', 'cyan', 'magenta']
linestyles = ['-', '--', '-.', ':', '-', '--']
classifiers = [pipelines[i] for i in range(6)]
labels = ['1: all words', 
          '2: porter stemmer',
          '3: snowball stemmer',
          '4: whitelist',
          '5: porter stemmer + whitelist',
          '6: snowball stemmer + whitelist',
          ]

for clf,col,ls,lab in zip(classifiers, colors, linestyles, labels):
    
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    cv = StratifiedKFold(y_train, n_folds=10, random_state=123)

    for i, (train, test) in enumerate(cv):
        probas_ = clf.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, 
             mean_tpr, 
             color=col, 
             linestyle=ls,
             label='%s (ROC AUC = %0.2f)' % (lab, mean_auc), 
             lw=2
    )

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')    
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.title('CountVectorizer')
plt.tight_layout()
plt.savefig('./nb_whitelist_model_images/roc_gridsearch_1.eps', dpi=300)
plt.show()



In [18]:
colors = ['blue', 'red', 'black', 'green', 'cyan', 'magenta']
linestyles = ['-', '--', '-.', ':', '-', '--']
classifiers = [pipelines[i] for i in range(6,12)]
labels = ['1: all words', 
          '2: porter stemmer',
          '3: snowball stemmer',
          '4: whitelist',
          '5: porter stemmer + whitelist',
          '6: snowball stemmer + whitelist',
          ]

for clf,col,ls,lab in zip(classifiers, colors, linestyles, labels):
    
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    cv = StratifiedKFold(y_train, n_folds=10, random_state=123)

    for i, (train, test) in enumerate(cv):
        probas_ = clf.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, 
             mean_tpr, 
             color=col, 
             linestyle=ls,
             label='%s (ROC AUC = %0.2f)' % (lab, mean_auc), 
             lw=2
    )

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')    
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.title('TfidfVectorizer')
plt.tight_layout()
plt.savefig('./nb_whitelist_model_images/roc_gridsearch_2.eps', dpi=300)
plt.show()



In [31]:
colors = ['blue', 'red', 'black', 'green', 'cyan', 'magenta']
linestyles = ['-', '--', '-.', ':', '-', '--']
classifiers = [pipelines[i] for i in range(12,18)]
labels = ['1: all words', 
          '2: porter stemmer',
          '3: snowball stemmer',
          '4: whitelist',
          '5: porter stemmer + whitelist',
          '6: snowball stemmer + whitelist',
          ]

plt.figure(figsize=(8,6))
for clf,col,ls,lab in zip(classifiers, colors, linestyles, labels):
    
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    cv = StratifiedKFold(y_train, n_folds=10, random_state=123)

    for i, (train, test) in enumerate(cv):
        probas_ = clf.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, 
             mean_tpr, 
             color=col, 
             linestyle=ls,
             label='%s (ROC AUC = %0.2f)' % (lab, mean_auc), 
             lw=2
    )


plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')    
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.title('Bernoulli')
plt.tight_layout()
plt.savefig('./nb_whitelist_model_images/roc_gridsearch_3.eps', dpi=300)
plt.show()



In [20]:
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import StratifiedKFold
from scipy import interp

sns.set()
sns.set_style("whitegrid")

classifier = pipelines[7] # Tfidf + porter


cv = StratifiedKFold(y_train, n_folds=10, random_state=123)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas_ = classifier.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i+1, roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic of the Lyrics Classifier')
plt.legend(loc="lower right")
plt.savefig('./nb_whitelist_model_images/roc_final.eps', dpi=300)
plt.show()



In [33]:
colors = ['blue', 'red', 'black', 'green', 'cyan', 'magenta']
linestyles = ['-', '--', '-.', ':', '-', '--']
classifiers = [pipelines[12], pipelines[0], pipelines[6]]
labels = ['1: BNB', 
          '2: MNB Tf',
          '3: MNB Tfidf',
          ]

plt.figure(figsize=(8,6))

for clf,col,ls,lab in zip(classifiers, colors, linestyles, labels):
    
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    cv = StratifiedKFold(y_train, n_folds=10, random_state=123)

    for i, (train, test) in enumerate(cv):
        probas_ = clf.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, 
             mean_tpr, 
             color=col, 
             linestyle=ls,
             label='%s (ROC AUC = %0.2f)' % (lab, mean_auc), 
             lw=2
    )

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')    
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.title('Different Naive Bayes Models')
plt.tight_layout()
plt.savefig('./nb_whitelist_model_images/roc_different_nb.eps', dpi=300)
plt.show()






Validation



Confusion matrix


In [21]:
import matplotlib as mpl
import numpy as np

def plot_cm(X, y, clf, title, dest):

    cm = metrics.confusion_matrix(y, clf.predict(X))

    np.set_printoptions(suppress=True)
    mpl.rc("figure", figsize=(4, 2))

    hm = sns.heatmap(cm, 
            cbar=False,
            annot=True, 
            square=True,
            fmt='d',
            yticklabels=['happy','sad'],
            xticklabels=['happy','sad'],
            cmap='Blues'
            )
    plt.title(title)
    plt.ylabel('actual class')
    plt.xlabel('predicted class')
    plt.tight_layout()
    plt.savefig(dest, dpi=300)
    plt.show()
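`metrics.confusion_matrix` puts the actual classes in rows and the predicted classes in columns, which is why the heatmap axes are labeled that way. A tiny hand-made example (hypothetical labels):

from sklearn import metrics

print(metrics.confusion_matrix([0, 0, 1, 1], [0, 1, 1, 1]))
# [[1 1]
#  [0 2]]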

In [22]:
clf = pipelines[7] # Tfidf + porter

clf.fit(X_train, y_train)

plot_cm(X_train, y_train, clf, 'Training - TfidfVectorizer Porter', './nb_whitelist_model_images/confmat_tfidf_porter_train.eps')



In [23]:
plot_cm(X_test, y_test, clf, 'Test - TfidfVectorizer Porter', './nb_whitelist_model_images/confmat_tfidf_porter_test.eps')




ROC AUC, Accuracy, Precision, Recall, and F1-score


In [16]:
from sklearn.naive_bayes import BernoulliNB

In [18]:
# Custom scorer methods to account for positive-negative class labels

from sklearn import metrics

# `pos_label` for positive class, since we have sad=1, happy=0

acc_scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
pre_scorer = metrics.make_scorer(metrics.precision_score, greater_is_better=True, pos_label=0)
rec_scorer = metrics.make_scorer(metrics.recall_score, greater_is_better=True, pos_label=0)
f1_scorer = metrics.make_scorer(metrics.f1_score, greater_is_better=True, pos_label=0)
auc_scorer = metrics.make_scorer(metrics.roc_auc_score, greater_is_better=True)

In [20]:
labels = ['BNB Training', 'BNB Training porter', 'BNB Training wl', 'BNB Training porter+wl',
          'MNB Training-CountVec', 'MNB Training-CountVec porter', 'MNB Training-CountVec wl', 'MNB Training-CountVec porter+wl',
          'MNB Training-TfidfVec', 'MNB Training-TfidfVec porter', 'MNB Training-TfidfVec wl', 'MNB Training-TfidfVec porter+wl',
          'BNB Test', 'BNB Test porter', 'BNB Test wl', 'BNB Test porter+wl',
          'MNB Test-CountVec', 'MNB Test-CountVec porter', 'MNB Test-CountVec wl', 'MNB Test-CountVec porter+wl',
          'MNB Test-TfidfVec', 'MNB Test-TfidfVec porter', 'MNB Test-TfidfVec wl', 'MNB Test-TfidfVec porter+wl'
          ]

d = {'Data':labels,
     'ACC (%)':[],
     'PRE (%)':[],
     'REC (%)':[],
     'F1 (%)':[],
     'ROC AUC (%)':[],
}
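
# NOTE: clf_1 ... clf_12 are not defined in this section; the mapping below is
# an assumption that reconstructs them from the `pipelines` list built earlier,
# so that they line up with the row labels (the snowball variants are skipped):
clf_1, clf_2, clf_3, clf_4 = pipelines[0], pipelines[1], pipelines[3], pipelines[4]        # MNB CountVec: raw, porter, wl, porter+wl
clf_5, clf_6, clf_7, clf_8 = pipelines[6], pipelines[7], pipelines[9], pipelines[10]       # MNB TfidfVec: raw, porter, wl, porter+wl
clf_9, clf_10, clf_11, clf_12 = pipelines[12], pipelines[13], pipelines[15], pipelines[16] # BNB CountVec: raw, porter, wl, porter+wl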


for clf in [clf_9, clf_10, clf_11, clf_12, clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_7, clf_8]:
    clf.fit(X_train, y_train)

for clf in [clf_9, clf_10, clf_11, clf_12, clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_7, clf_8]:

    d['ACC (%)'].append(acc_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['PRE (%)'].append(pre_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['REC (%)'].append(rec_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['F1 (%)'].append(f1_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['ROC AUC (%)'].append(auc_scorer(estimator=clf, X=X_train, y_true=y_train))

for clf in [clf_9, clf_10, clf_11, clf_12, clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_7, clf_8]:  

    d['ACC (%)'].append(acc_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['PRE (%)'].append(pre_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['REC (%)'].append(rec_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['F1 (%)'].append(f1_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['ROC AUC (%)'].append(auc_scorer(estimator=clf, X=X_test, y_true=y_test))

In [25]:
pd.set_option('precision', 2)

df_perform = pd.DataFrame(d)
df_perform = df_perform[['ACC (%)', 'PRE (%)', 'REC (%)', 'F1 (%)', 'ROC AUC (%)']]
df_perform.index=(labels)
df_perform = df_perform*100
df_perform = np.round(df_perform, decimals=2)
df_perform


Out[25]:
|   | ACC (%) | PRE (%) | REC (%) | F1 (%) | ROC AUC (%) |
|---|---------|---------|---------|--------|-------------|
| BNB Training | 96.5 | 93.8 | 98.7 | 96.2 | 96.7 |
| BNB Training porter | 93.4 | 89.3 | 96.9 | 92.9 | 93.7 |
| BNB Training wl | 81.1 | 72.1 | 94.0 | 81.6 | 82.3 |
| BNB Training porter+wl | 78.8 | 70.0 | 91.9 | 79.5 | 80.1 |
| MNB Training-CountVec | 95.7 | 94.1 | 96.4 | 95.2 | 95.8 |
| MNB Training-CountVec porter | 93.6 | 92.8 | 92.8 | 92.8 | 93.5 |
| MNB Training-CountVec wl | 81.2 | 75.8 | 85.0 | 80.1 | 81.6 |
| MNB Training-CountVec porter+wl | 80.5 | 75.9 | 82.5 | 79.0 | 80.7 |
| MNB Training-TfidfVec | 90.5 | 99.7 | 78.9 | 88.1 | 89.4 |
| MNB Training-TfidfVec porter | 86.2 | 99.0 | 69.7 | 81.8 | 84.6 |
| MNB Training-TfidfVec wl | 84.8 | 82.0 | 84.5 | 83.2 | 84.8 |
| MNB Training-TfidfVec porter+wl | 83.2 | 80.5 | 82.3 | 81.4 | 83.1 |
| BNB Test | 70.5 | 75.0 | 65.7 | 70.0 | 70.8 |
| BNB Test porter | 70.0 | 74.2 | 65.7 | 69.7 | 70.2 |
| BNB Test wl | 65.5 | 65.2 | 73.3 | 69.1 | 65.1 |
| BNB Test porter+wl | 67.0 | 67.0 | 73.3 | 70.0 | 66.7 |
| MNB Test-CountVec | 72.0 | 78.8 | 63.8 | 70.5 | 72.4 |
| MNB Test-CountVec porter | 68.0 | 76.0 | 57.1 | 65.2 | 68.6 |
| MNB Test-CountVec wl | 64.5 | 66.7 | 64.8 | 65.7 | 64.5 |
| MNB Test-CountVec porter+wl | 63.5 | 66.0 | 62.9 | 64.4 | 63.5 |
| MNB Test-TfidfVec | 59.5 | 80.0 | 30.5 | 44.1 | 61.0 |
| MNB Test-TfidfVec porter | 60.5 | 84.2 | 30.5 | 44.8 | 62.1 |
| MNB Test-TfidfVec wl | 65.5 | 70.5 | 59.0 | 64.2 | 65.8 |
| MNB Test-TfidfVec porter+wl | 63.5 | 68.6 | 56.2 | 61.8 | 63.9 |

In [28]:
df_perform.to_csv('./nb_whitelist_model_data/clf_performance.csv', index_label=False, float_format='%2.2f')



Cross Validation Scores


In [43]:
from sklearn import metrics
from sklearn import cross_validation

# For the cross-validation estimates, use the whole dataset (training + validation)
X_train = np.concatenate((X_train, X_test))
y_train = np.concatenate((y_train, y_test))

labels = ['CountVec', 'CountVec porter', 'CountVec wl', 'CountVec porter+wl',
             'TfidfVec', 'TfidfVec porter', 'TfidfVec wl', 'TfidfVec porter+wl']

d = {'Data':labels,
     'ACC (%)':[],
     'PRE (%)':[],
     'REC (%)':[],
     'F1 (%)':[],
     'ROC AUC (%)':[],
}

for clf in [clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_7, clf_8]:
    clf.fit(X_train, y_train)
    
   
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring=acc_scorer, cv=10)
    d['ACC (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))
    
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring=pre_scorer, cv=10)
    d['PRE (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))
    
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring=rec_scorer, cv=10)
    d['REC (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))
    
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring=f1_scorer, cv=10)
    d['F1 (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))
    
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring=auc_scorer, cv=10)
    d['ROC AUC (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))

In [44]:
df_perform = pd.DataFrame(d)
df_perform = df_perform[['ACC (%)', 'PRE (%)', 'REC (%)', 'F1 (%)', 'ROC AUC (%)']]
df_perform.index=(labels)
df_perform


Out[44]:
|   | ACC (%) | PRE (%) | REC (%) | F1 (%) | ROC AUC (%) |
|---|---------|---------|---------|--------|-------------|
| CountVec | 67.00 (+/- 4.07) | 65.30 (+/- 4.80) | 60.06 (+/- 6.20) | 62.48 (+/- 5.17) | 66.48 (+/- 4.16) |
| CountVec porter | 67.66 (+/- 2.76) | 66.78 (+/- 3.68) | 59.16 (+/- 6.31) | 62.54 (+/- 4.27) | 67.02 (+/- 2.89) |
| CountVec wl | 64.83 (+/- 5.30) | 60.72 (+/- 5.07) | 67.32 (+/- 8.33) | 63.63 (+/- 5.60) | 65.01 (+/- 5.26) |
| CountVec porter+wl | 66.49 (+/- 4.26) | 62.52 (+/- 4.70) | 68.41 (+/- 7.88) | 65.09 (+/- 4.93) | 66.64 (+/- 4.26) |
| TfidfVec | 67.75 (+/- 4.18) | 76.67 (+/- 6.99) | 42.63 (+/- 7.36) | 54.56 (+/- 7.41) | 65.85 (+/- 4.39) |
| TfidfVec porter | 67.09 (+/- 4.30) | 78.91 (+/- 6.08) | 38.47 (+/- 9.51) | 51.13 (+/- 9.57) | 64.92 (+/- 4.69) |
| TfidfVec wl | 64.41 (+/- 3.93) | 61.30 (+/- 4.08) | 61.70 (+/- 6.78) | 61.32 (+/- 4.52) | 64.21 (+/- 3.94) |
| TfidfVec porter+wl | 66.75 (+/- 3.73) | 63.76 (+/- 4.15) | 64.61 (+/- 7.09) | 63.96 (+/- 4.47) | 66.59 (+/- 3.76) |
