In [1]:
%load_ext watermark

In [2]:
%watermark -d -v -a 'Sebastian Raschka' -p scikit-learn,nltk,numpy


Sebastian Raschka 13/12/2014 

CPython 3.4.2
IPython 2.3.0

scikit-learn 0.15.2
nltk 3.0.0
numpy 1.9.1

[More information](https://github.com/rasbt/watermark) about the `watermark` magic command extension.



Lyrics Mood Classification - Training





Preprocessing



Reading the dataset


In [3]:
import pandas as pd

df_train = pd.read_csv('../../dataset/training/train_lyrics_1000.csv')
df_test = pd.read_csv('../../dataset/validation/valid_lyrics_200.csv')

X_train = df_train['lyrics'].values 
y_train = df_train['mood'].values

X_test = df_test['lyrics'].values 
y_test = df_test['mood'].values

df_train.head()


Out[3]:
|   | file | artist | title | lyrics | mood | year |
|---|------|--------|-------|--------|------|------|
| 0 | TRAAAAW128F429D538.h5 | Casual | I Didn't Mean To | Verse One:\n\nAlright I might\nHave had a litt... | sad | 1994 |
| 1 | TRAAAEF128F4273421.h5 | Adam Ant | Something Girls | Adam Ant/Marco Pirroni\nEvery girl is a someth... | happy | 1982 |
| 2 | TRAAAFD128F92F423A.h5 | Gob | Face the Ashes | I've just erased it's been a while, I've got a... | sad | 2007 |
| 3 | TRAABJV128F1460C49.h5 | Lionel Richie | Tonight Will Be Alright | Little darling \nWhere you've been so long \nI... | happy | 1986 |
| 4 | TRAABLR128F423B7E3.h5 | Blue Rodeo | Floating | Lead Vocal by Greg\n\nWell, these late night c... | sad | 1987 |



Label Encoder


In [4]:
import pickle
import numpy as np

pickle_in = open('./label_encoder.p', 'rb')
le = pickle.load(pickle_in)
pickle_in.close()

print('before: %s ...' %y_train[:5])

y_train = le.transform(y_train)
y_test = le.transform(y_test)

print('after: %s ...' %y_train[:5])


before: ['sad' 'happy' 'sad' 'happy' 'sad'] ...
after: [1 0 1 0 1] ...
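The pickled encoder behaves like a scikit-learn `LabelEncoder` fit on the two mood labels, with the classes sorted alphabetically (happy=0, sad=1). A minimal sketch of how an equivalent encoder could have been created and pickled (an assumption; the actual encoder was built elsewhere):

from sklearn.preprocessing import LabelEncoder
import pickle

le = LabelEncoder()
le.fit(['happy', 'sad'])   # classes_ sorted alphabetically: happy -> 0, sad -> 1
# pickle.dump(le, open('./label_encoder.p', 'wb'))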



Stop word list


In [5]:
with open('./stopwords_eng.txt') as sw:
    stopwords = sw.read().split()
stopwords[-10:]


Out[5]:
['too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']
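The last ten entries match the tail of NLTK's English stop word list, so the file presumably mirrors it. Assuming that origin, the list can be compared against NLTK directly:

from nltk.corpus import stopwords as nltk_sw
# import nltk; nltk.download('stopwords')  # one-time download

print(nltk_sw.words('english')[-10:])  # compare with the file's tail above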



Feature extraction: Word counts and Vectorizers

Initial vocabularies


In [6]:
# Without stemming (raw word tokens)
from sklearn.feature_extraction.text import CountVectorizer

import re

vec_plain = CountVectorizer(analyzer='word',
                      decode_error='replace',
                      tokenizer=lambda text: text.split(),
                      preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                      stop_words=stopwords,)


vec_plain.fit(X_train)

print('Vocabulary size: %s words' % len(vec_plain.vocabulary_))

vocab_1 = vec_plain.get_feature_names()
print(vocab_1[-10:])


Vocabulary size: 11378 words
['zipped', 'zippin', 'zipping', 'zippo', 'zodiac', 'zoli', 'zombie', 'zone', 'zonin', 'zoo']
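To make the preprocessing and tokenization steps concrete, here is what they do to a made-up lyric line (the sample string is invented for illustration; stop words such as 'i' and 'you' are removed later by the vectorizer):

sample = "I've been waiting 4 you, baby!"
cleaned = re.sub('[^a-zA-Z]', ' ', sample.lower())  # strip digits/punctuation, lowercase
print(cleaned.split())  # ['i', 've', 'been', 'waiting', 'you', 'baby']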

In [7]:
# With Porter stemming
from nltk.stem import PorterStemmer

porter = PorterStemmer()

vec_porter = CountVectorizer(analyzer='word',
                      decode_error='replace',
                      tokenizer=lambda text: [porter.stem(word) for word in text.split()],
                      preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()), 
                      stop_words=stopwords,) 


vec_porter.fit(X_train)

print('Vocabulary size: %s words' % len(vec_porter.vocabulary_))

vocab_2 = vec_porter.get_feature_names()
print(vocab_2[-10:])


Vocabulary size: 8551 words
['zip', 'ziplock', 'zippin', 'zippo', 'zodiac', 'zoli', 'zombi', 'zone', 'zonin', 'zoo']
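The stemmer's effect shows in the vocabulary tail above ('zombie' became 'zombi', and 'zipped'/'zipping' collapsed into 'zip'); it can also be inspected word by word:

for word in ['zombie', 'zipping', 'erased']:
    print('%s -> %s' % (word, porter.stem(word)))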



Whitelists of positive and negative words


In [8]:
with open('./whitelist_dicts/negative-words.txt', 'r') as w:
    negative_words = w.read().split()
print('number of negative words: %s' % len(negative_words))
negative_words[-5:]


number of negative words: 4783
Out[8]:
['zaps', 'zealot', 'zealous', 'zealously', 'zombie']

In [9]:
with open('./whitelist_dicts/positive-words.txt', 'r') as w:
    positive_words = w.read().split()
print('number of positive words: %s' % len(positive_words))
positive_words[-5:]


number of positive words: 2006
Out[9]:
['youthful', 'zeal', 'zenith', 'zest', 'zippy']

In [10]:
negative_words = set(negative_words)
positive_words = set(positive_words)
semantic_words = negative_words.union(positive_words)

#pickle.dump(semantic_words, open('./whitelist_dicts/semantic_words.p', 'wb'))
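A couple of quick sanity checks on the combined whitelist (the membership checks follow from the outputs above; the union can be smaller than 4783 + 2006 if the two lists overlap):

print(len(semantic_words))         # at most 4783 + 2006
print('zombie' in negative_words)  # True, see Out[8]
print('zeal' in positive_words)    # True, see Out[9]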




In [11]:
# With whitelist only (no stemming)
from sklearn.feature_extraction.text import CountVectorizer
import re

vec_wl = CountVectorizer(analyzer='word',
                      decode_error='replace',
                      tokenizer=lambda text: [word for word in text.split() if word in semantic_words],
                      preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                      stop_words=stopwords,)


vec_wl.fit(X_train)

print('Vocabulary size: %s words' % len(vec_wl.vocabulary_))

vocab_3 = vec_wl.get_feature_names()
print(vocab_3[-10:])


Vocabulary size: 1666 words
['wrath', 'wreck', 'wrest', 'wrestle', 'wretched', 'wrong', 'yawn', 'yay', 'zest', 'zombie']

In [12]:
# With whitelist and Porter stemming
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
import re

porter = PorterStemmer()

vec_porter_wl = CountVectorizer(analyzer='word',
                      decode_error='replace',
                      tokenizer=lambda text: [porter.stem(word) for word in text.split() if word in semantic_words],
                      preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                      stop_words=stopwords,)


vec_porter_wl.fit(X_train)

print('Vocabulary size: %s words' % len(vec_porter_wl.vocabulary_))

vocab_4 = vec_porter_wl.get_feature_names()
print(vocab_4[-10:])


Vocabulary size: 1349 words
['wrath', 'wreck', 'wrest', 'wrestl', 'wretch', 'wrong', 'yawn', 'yay', 'zest', 'zombi']



Summary vocabulary sizes


In [14]:
print('All words: %s ' % len(vocab_1))
print('Porter: %s ' % len(vocab_2))
print('Whitelist: %s ' % len(vocab_3))
print('Porter + Whitelist: %s ' % len(vocab_4))


All words: 11378 
Porter: 8551 
Whitelist: 1666 
Porter + Whitelist: 1349 





Model Selection



Setup


In [13]:
# Requirements

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from nltk.stem import PorterStemmer
import re
import pickle
from sklearn import metrics

porter = PorterStemmer()

semantic_words = pickle.load(open('./whitelist_dicts/semantic_words.p', 'rb'))

with open('./stopwords_eng.txt') as sw:
    stop_words = sw.read().split()
    
f1_scorer = metrics.make_scorer(metrics.f1_score, greater_is_better=True, pos_label=0)
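Since the moods are encoded as happy=0 and sad=1, `pos_label=0` makes 'happy' the positive class of the F1 scorer, where F1 = 2 * (precision * recall) / (precision + recall). A tiny hand-made example (hypothetical labels, not from the dataset):

from sklearn import metrics

y_true = [0, 0, 1, 1]  # two happy, two sad
y_pred = [0, 1, 1, 1]  # one happy recovered

# precision for class 0: 1/1 = 1.0; recall for class 0: 1/2 = 0.5
print(metrics.f1_score(y_true, y_pred, pos_label=0))  # 2*(1.0*0.5)/(1.0+0.5) = 0.667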




In [14]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer

porter = PorterStemmer()
snowball = EnglishStemmer()

# raw words
tokenizer = lambda text: text.split()

# words after Porter stemming 
tokenizer_porter = lambda text: [porter.stem(word) for word in text.split()]

# Words after Snowball stemming
tokenizer_snowball = lambda text: [snowball.stem(word) for word in text.split()]

# Only words that are in a list of 'positive' or 'negative' words ('whitelist')
# http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon
tokenizer_whitelist = lambda text: [word for word in text.split() if word in semantic_words]

# Porter-stemmed words in whitelist
tokenizer_porter_wl = lambda text: [porter.stem(word) for word in text.split() if word in semantic_words]

# Snowball-stemmed words in whitelist
tokenizer_snowball_wl = lambda text: [snowball.stem(word) for word in text.split() if word in semantic_words]
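A quick way to see where the two stemmers disagree; the printed stems are whatever NLTK's implementations return for these sample words ('fairly' is a classic case where Porter and Snowball differ):

for word in ['running', 'fairly', 'generously']:
    print('%s -> porter: %s, snowball: %s' % (word, porter.stem(word), snowball.stem(word)))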

In [86]:
pipeline_1 = Pipeline([
    ('vect', CountVectorizer(binary=False,
                             stop_words=stop_words,
                             ngram_range=(1,1),
                             preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                             )),
    ('clf', MultinomialNB(fit_prior=False, 
                          alpha=1.0))
])

parameters_1 = dict(
    vect__tokenizer=[tokenizer, tokenizer_porter, tokenizer_whitelist, tokenizer_porter_wl],
)

grid_search_1 = GridSearchCV(pipeline_1, 
                           parameters_1, 
                           n_jobs=1, 
                           verbose=1,
                           scoring=f1_scorer,
                           cv=10
                )


print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_1.steps])
grid_search_1.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search_1.best_score_)
print("Best parameters set:")
best_parameters_1 = grid_search_1.best_estimator_.get_params()
for param_name in sorted(parameters_1.keys()):
    print("\t%s: %r" % (param_name, best_parameters_1[param_name]))


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    1.2s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.2min finished
Performing grid search...
pipeline: ['vect', 'clf']
Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best score: 0.632
Best parameters set:
	vect__tokenizer: <function <lambda> at 0x10a4d2f28>

In [87]:
pipeline_2 = Pipeline([
    ('vect', TfidfVectorizer(binary=False,
                             stop_words=stop_words,
                             ngram_range=(1,1),
                             preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                             )),
    ('clf', MultinomialNB(fit_prior=False, 
                          alpha=1.0))
])

parameters_2 = dict(
    vect__tokenizer=[tokenizer, tokenizer_porter, tokenizer_whitelist, tokenizer_porter_wl],
)

grid_search_2 = GridSearchCV(pipeline_2, 
                           parameters_2, 
                           n_jobs=1, 
                           verbose=1,
                           scoring=f1_scorer,
                           cv=10
                )


print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_2.steps])
grid_search_2.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search_2.best_score_)
print("Best parameters set:")
best_parameters_2 = grid_search_2.best_estimator_.get_params()
for param_name in sorted(parameters_2.keys()):
    print("\t%s: %r" % (param_name, best_parameters_1[param_name]))


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    1.0s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.0min finished
Performing grid search...
pipeline: ['vect', 'clf']
Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best score: 0.606
Best parameters set:
	vect__tokenizer: <function <lambda> at 0x10a4d2f28>



ROC Curves


In [15]:
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import StratifiedKFold
from scipy import interp

sns.set()
sns.set_style("whitegrid")

preprocessor = lambda text: re.sub('[^a-zA-Z]', ' ', text.lower())

# Twelve vectorizers: {CountVectorizer, TfidfVectorizer} x the six tokenizers
# defined above, all with the same settings
vectorizers = []
for Vectorizer in (CountVectorizer, TfidfVectorizer):
    for tok in (tokenizer, tokenizer_porter, tokenizer_snowball,
                tokenizer_whitelist, tokenizer_porter_wl, tokenizer_snowball_wl):
        vectorizers.append(Vectorizer(binary=False,
                                      stop_words=stop_words,
                                      ngram_range=(1,1),
                                      preprocessor=preprocessor,
                                      tokenizer=tok))

pipelines = []
for v in vectorizers:
    pipelines.append(Pipeline([('vect', v),
                               ('clf', MultinomialNB(fit_prior=False, alpha=1.0))]))
    
for v in vectorizers[:6]:
    pipelines.append(Pipeline([('vect', v),
                               ('clf', BernoulliNB(fit_prior=False, alpha=1.0))]))

In [17]:
colors = ['blue', 'red', 'black', 'green', 'cyan', 'magenta']
linestyles = ['-', '--', '-.', ':', '-', '--']
classifiers = [pipelines[i] for i in range(6)]
labels = ['1: all words', 
          '2: porter stemmer',
          '3: snowball stemmer',
          '4: whitelist',
          '5: porter stemmer + whitelist',
          '6: snowball stemmer + whitelist',
          ]

for clf,col,ls,lab in zip(classifiers, colors, linestyles, labels):
    
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    cv = StratifiedKFold(y_train, n_folds=10, random_state=123)

    for i, (train, test) in enumerate(cv):
        probas_ = clf.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, 
             mean_tpr, 
             color=col, 
             linestyle=ls,
             label='%s (ROC AUC = %0.2f)' % (lab, mean_auc), 
             lw=2
    )

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')    
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.title('CountVectorizer')
plt.tight_layout()
plt.savefig('./nb_whitelist_model_images/roc_gridsearch_1.eps', dpi=300)
plt.show()



In [18]:
colors = ['blue', 'red', 'black', 'green', 'cyan', 'magenta']
linestyles = ['-', '--', '-.', ':', '-', '--']
classifiers = [pipelines[i] for i in range(6,12)]
labels = ['1: all words', 
          '2: porter stemmer',
          '3: snowball stemmer',
          '4: whitelist',
          '5: porter stemmer + whitelist',
          '6: snowball stemmer + whitelist',
          ]

for clf,col,ls,lab in zip(classifiers, colors, linestyles, labels):
    
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    cv = StratifiedKFold(y_train, n_folds=10, random_state=123)

    for i, (train, test) in enumerate(cv):
        probas_ = clf.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, 
             mean_tpr, 
             color=col, 
             linestyle=ls,
             label='%s (ROC AUC = %0.2f)' % (lab, mean_auc), 
             lw=2
    )

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')    
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.title('TfidfVectorizer')
plt.tight_layout()
plt.savefig('./nb_whitelist_model_images/roc_gridsearch_2.eps', dpi=300)
plt.show()



In [31]:
colors = ['blue', 'red', 'black', 'green', 'cyan', 'magenta']
linestyles = ['-', '--', '-.', ':', '-', '--']
classifiers = [pipelines[i] for i in range(12,18)]
labels = ['1: all words', 
          '2: porter stemmer',
          '3: snowball stemmer',
          '4: whitelist',
          '5: porter stemmer + whitelist',
          '6: snowball stemmer + whitelist',
          ]

plt.figure(figsize=(8,6))
for clf,col,ls,lab in zip(classifiers, colors, linestyles, labels):
    
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    cv = StratifiedKFold(y_train, n_folds=10, random_state=123)

    for i, (train, test) in enumerate(cv):
        probas_ = clf.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, 
             mean_tpr, 
             color=col, 
             linestyle=ls,
             label='%s (ROC AUC = %0.2f)' % (lab, mean_auc), 
             lw=2
    )


plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')    
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.title('Bernoulli')
plt.tight_layout()
plt.savefig('./nb_whitelist_model_images/roc_gridsearch_3.eps', dpi=300)
plt.show()



In [20]:
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import StratifiedKFold
from scipy import interp

sns.set()
sns.set_style("whitegrid")

classifier = pipelines[7] # Tfidf + porter


cv = StratifiedKFold(y_train, n_folds=10, random_state=123)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas_ = classifier.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i+1, roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic of the Lyrics Classifier')
plt.legend(loc="lower right")
plt.savefig('./nb_whitelist_model_images/roc_final.eps', dpi=300)
plt.show()



In [33]:
colors = ['blue', 'red', 'black', 'green', 'cyan', 'magenta']
linestyles = ['-', '--', '-.', ':', '-', '--']
classifiers = [pipelines[12], pipelines[0], pipelines[6]]
labels = ['1: BNB', 
          '2: MNB Tf',
          '3: MNB Tfidf',
          ]

plt.figure(figsize=(8,6))

for clf,col,ls,lab in zip(classifiers, colors, linestyles, labels):
    
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    cv = StratifiedKFold(y_train, n_folds=10, random_state=123)

    for i, (train, test) in enumerate(cv):
        probas_ = clf.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, 
             mean_tpr, 
             color=col, 
             linestyle=ls,
             label='%s (ROC AUC = %0.2f)' % (lab, mean_auc), 
             lw=2
    )

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')    
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.title('Different Naive Bayes Models')
plt.tight_layout()
plt.savefig('./nb_whitelist_model_images/roc_different_nb.eps', dpi=300)
plt.show()






Validation



Confusion matrix


In [21]:
import matplotlib as mpl
import numpy as np

def plot_cm(X, y, clf, title, dest):

    cm = metrics.confusion_matrix(y, clf.predict(X))

    np.set_printoptions(suppress=True)
    mpl.rc("figure", figsize=(4, 2))

    hm = sns.heatmap(cm, 
            cbar=False,
            annot=True, 
            square=True,
            fmt='d',
            yticklabels=['happy','sad'],
            xticklabels=['happy','sad'],
            cmap='Blues'
            )
    plt.title(title)
    plt.ylabel('actual class')
    plt.xlabel('predicted class')
    plt.tight_layout()
    plt.savefig(dest, dpi=300)
    plt.show()
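`metrics.confusion_matrix` puts the actual classes in rows and the predicted classes in columns, which is why the heatmap axes are labeled that way. A tiny hand-made example (hypothetical labels):

from sklearn import metrics

print(metrics.confusion_matrix([0, 0, 1, 1], [0, 1, 1, 1]))
# [[1 1]
#  [0 2]]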

In [22]:
clf = pipelines[7] # Tfidf + porter

clf.fit(X_train, y_train)

plot_cm(X_train, y_train, clf, 'Training - TfidfVectorizer Porter', './nb_whitelist_model_images/confmat_tfidf_porter_train.eps')



In [23]:
plot_cm(X_test, y_test, clf, 'Test - TfidfVectorizer Porter', './nb_whitelist_model_images/confmat_tfidf_porter_test.eps')




ROC AUC, Accuracy, Precision, Recall, and F1-score


In [16]:
from sklearn.naive_bayes import BernoulliNB

In [18]:
# Custom scorer methods to account for positive-negative class labels

from sklearn import metrics

# `pos_label` for positive class, since we have sad=1, happy=0

acc_scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
pre_scorer = metrics.make_scorer(metrics.precision_score, greater_is_better=True, pos_label=0)
rec_scorer = metrics.make_scorer(metrics.recall_score, greater_is_better=True, pos_label=0)
f1_scorer = metrics.make_scorer(metrics.f1_score, greater_is_better=True, pos_label=0)
auc_scorer = metrics.make_scorer(metrics.roc_auc_score, greater_is_better=True)

In [20]:
labels = ['BNB Training', 'BNB Training porter', 'BNB Training wl', 'BNB Training porter+wl',
          'MNB Training-CountVec', 'MNB Training-CountVec porter', 'MNB Training-CountVec wl', 'MNB Training-CountVec porter+wl',
          'MNB Training-TfidfVec', 'MNB Training-TfidfVec porter', 'MNB Training-TfidfVec wl', 'MNB Training-TfidfVec porter+wl',
          'BNB Test', 'BNB Test porter', 'BNB Test wl', 'BNB Test porter+wl',
          'MNB Test-CountVec', 'MNB Test-CountVec porter', 'MNB Test-CountVec wl', 'MNB Test-CountVec porter+wl',
          'MNB Test-TfidfVec', 'MNB Test-TfidfVec porter', 'MNB Test-TfidfVec wl', 'MNB Test-TfidfVec porter+wl'
          ]

d = {'Data':labels,
     'ACC (%)':[],
     'PRE (%)':[],
     'REC (%)':[],
     'F1 (%)':[],
     'ROC AUC (%)':[],
}
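
# NOTE: clf_1 ... clf_12 are not defined in this section; the mapping below is
# an assumption that reconstructs them from the `pipelines` list built earlier,
# so that they line up with the row labels (the snowball variants are skipped):
clf_1, clf_2, clf_3, clf_4 = pipelines[0], pipelines[1], pipelines[3], pipelines[4]        # MNB CountVec: raw, porter, wl, porter+wl
clf_5, clf_6, clf_7, clf_8 = pipelines[6], pipelines[7], pipelines[9], pipelines[10]       # MNB TfidfVec: raw, porter, wl, porter+wl
clf_9, clf_10, clf_11, clf_12 = pipelines[12], pipelines[13], pipelines[15], pipelines[16] # BNB CountVec: raw, porter, wl, porter+wl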


for clf in [clf_9, clf_10, clf_11, clf_12, clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_7, clf_8]:
    clf.fit(X_train, y_train)

for clf in [clf_9, clf_10, clf_11, clf_12, clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_7, clf_8]:

    d['ACC (%)'].append(acc_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['PRE (%)'].append(pre_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['REC (%)'].append(rec_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['F1 (%)'].append(f1_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['ROC AUC (%)'].append(auc_scorer(estimator=clf, X=X_train, y_true=y_train))

for clf in [clf_9, clf_10, clf_11, clf_12, clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_7, clf_8]:  

    d['ACC (%)'].append(acc_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['PRE (%)'].append(pre_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['REC (%)'].append(rec_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['F1 (%)'].append(f1_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['ROC AUC (%)'].append(auc_scorer(estimator=clf, X=X_test, y_true=y_test))

In [25]:
pd.set_option('precision', 2)

df_perform = pd.DataFrame(d)
df_perform = df_perform[['ACC (%)', 'PRE (%)', 'REC (%)', 'F1 (%)', 'ROC AUC (%)']]
df_perform.index=(labels)
df_perform = df_perform*100
df_perform = np.round(df_perform, decimals=2)
df_perform


Out[25]:
|   | ACC (%) | PRE (%) | REC (%) | F1 (%) | ROC AUC (%) |
|---|---------|---------|---------|--------|-------------|
| BNB Training | 96.5 | 93.8 | 98.7 | 96.2 | 96.7 |
| BNB Training porter | 93.4 | 89.3 | 96.9 | 92.9 | 93.7 |
| BNB Training wl | 81.1 | 72.1 | 94.0 | 81.6 | 82.3 |
| BNB Training porter+wl | 78.8 | 70.0 | 91.9 | 79.5 | 80.1 |
| MNB Training-CountVec | 95.7 | 94.1 | 96.4 | 95.2 | 95.8 |
| MNB Training-CountVec porter | 93.6 | 92.8 | 92.8 | 92.8 | 93.5 |
| MNB Training-CountVec wl | 81.2 | 75.8 | 85.0 | 80.1 | 81.6 |
| MNB Training-CountVec porter+wl | 80.5 | 75.9 | 82.5 | 79.0 | 80.7 |
| MNB Training-TfidfVec | 90.5 | 99.7 | 78.9 | 88.1 | 89.4 |
| MNB Training-TfidfVec porter | 86.2 | 99.0 | 69.7 | 81.8 | 84.6 |
| MNB Training-TfidfVec wl | 84.8 | 82.0 | 84.5 | 83.2 | 84.8 |
| MNB Training-TfidfVec porter+wl | 83.2 | 80.5 | 82.3 | 81.4 | 83.1 |
| BNB Test | 70.5 | 75.0 | 65.7 | 70.0 | 70.8 |
| BNB Test porter | 70.0 | 74.2 | 65.7 | 69.7 | 70.2 |
| BNB Test wl | 65.5 | 65.2 | 73.3 | 69.1 | 65.1 |
| BNB Test porter+wl | 67.0 | 67.0 | 73.3 | 70.0 | 66.7 |
| MNB Test-CountVec | 72.0 | 78.8 | 63.8 | 70.5 | 72.4 |
| MNB Test-CountVec porter | 68.0 | 76.0 | 57.1 | 65.2 | 68.6 |
| MNB Test-CountVec wl | 64.5 | 66.7 | 64.8 | 65.7 | 64.5 |
| MNB Test-CountVec porter+wl | 63.5 | 66.0 | 62.9 | 64.4 | 63.5 |
| MNB Test-TfidfVec | 59.5 | 80.0 | 30.5 | 44.1 | 61.0 |
| MNB Test-TfidfVec porter | 60.5 | 84.2 | 30.5 | 44.8 | 62.1 |
| MNB Test-TfidfVec wl | 65.5 | 70.5 | 59.0 | 64.2 | 65.8 |
| MNB Test-TfidfVec porter+wl | 63.5 | 68.6 | 56.2 | 61.8 | 63.9 |

In [28]:
df_perform.to_csv('./nb_whitelist_model_data/clf_performance.csv', index_label=False, float_format='%2.2f')



Cross Validation Scores


In [43]:
from sklearn import metrics
from sklearn import cross_validation

# For the cross-validation estimates, use the whole dataset (training + validation)
X_train = np.concatenate((X_train, X_test))
y_train = np.concatenate((y_train, y_test))

labels = ['CountVec', 'CountVec porter', 'CountVec wl', 'CountVec porter+wl',
             'TfidfVec', 'TfidfVec porter', 'TfidfVec wl', 'TfidfVec porter+wl']

d = {'Data':labels,
     'ACC (%)':[],
     'PRE (%)':[],
     'REC (%)':[],
     'F1 (%)':[],
     'ROC AUC (%)':[],
}

for clf in [clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_7, clf_8]:
    clf.fit(X_train, y_train)
    
   
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring=acc_scorer, cv=10)
    d['ACC (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))
    
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring=pre_scorer, cv=10)
    d['PRE (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))
    
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring=rec_scorer, cv=10)
    d['REC (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))
    
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring=f1_scorer, cv=10)
    d['F1 (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))
    
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring=auc_scorer, cv=10)
    d['ROC AUC (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))

In [44]:
df_perform = pd.DataFrame(d)
df_perform = df_perform[['ACC (%)', 'PRE (%)', 'REC (%)', 'F1 (%)', 'ROC AUC (%)']]
df_perform.index=(labels)
df_perform


Out[44]:
|   | ACC (%) | PRE (%) | REC (%) | F1 (%) | ROC AUC (%) |
|---|---------|---------|---------|--------|-------------|
| CountVec | 67.00 (+/- 4.07) | 65.30 (+/- 4.80) | 60.06 (+/- 6.20) | 62.48 (+/- 5.17) | 66.48 (+/- 4.16) |
| CountVec porter | 67.66 (+/- 2.76) | 66.78 (+/- 3.68) | 59.16 (+/- 6.31) | 62.54 (+/- 4.27) | 67.02 (+/- 2.89) |
| CountVec wl | 64.83 (+/- 5.30) | 60.72 (+/- 5.07) | 67.32 (+/- 8.33) | 63.63 (+/- 5.60) | 65.01 (+/- 5.26) |
| CountVec porter+wl | 66.49 (+/- 4.26) | 62.52 (+/- 4.70) | 68.41 (+/- 7.88) | 65.09 (+/- 4.93) | 66.64 (+/- 4.26) |
| TfidfVec | 67.75 (+/- 4.18) | 76.67 (+/- 6.99) | 42.63 (+/- 7.36) | 54.56 (+/- 7.41) | 65.85 (+/- 4.39) |
| TfidfVec porter | 67.09 (+/- 4.30) | 78.91 (+/- 6.08) | 38.47 (+/- 9.51) | 51.13 (+/- 9.57) | 64.92 (+/- 4.69) |
| TfidfVec wl | 64.41 (+/- 3.93) | 61.30 (+/- 4.08) | 61.70 (+/- 6.78) | 61.32 (+/- 4.52) | 64.21 (+/- 3.94) |
| TfidfVec porter+wl | 66.75 (+/- 3.73) | 63.76 (+/- 4.15) | 64.61 (+/- 7.09) | 63.96 (+/- 4.47) | 66.59 (+/- 3.76) |
