SVMs / KNNs on 20News datasets

Setups


In [11]:
from __future__ import print_function
import os
import time
import sys

#from tabulate import tabulate
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from sklearn import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score, train_test_split, KFold
#from gensim.models.word2vec import Word2Vec 

np.random.seed(1337)

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1
DROP_OUT = 0.3
Nb_EPOCH = 10
BATCH_SIZE = 10
Classes = 20

BASE_DIR = '.'
GLOVE_DIR = BASE_DIR + '/glove.twitter.27B/'
FILENAME = 'glove.twitter.27B.' + str(EMBEDDING_DIM) + 'd.txt'
TEXT_DATA_DIR = BASE_DIR + '/20_newsgroups_'+ str(Classes)

1. Load Data


In [12]:
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                texts.append(f.read())
                f.close()
                labels.append(label_id)
print('Found %s texts.' % len(texts))
texts, labels = np.array(texts), np.array(labels)
print (texts.shape, labels.shape)


Found 19997 texts.
(19997,) (19997,)

2. Prepare word embedding


In [13]:
embeddings_index = {}
# fname = os.path.join(GLOVE_DIR, 'glove.twitter.27B.' + str(EMBEDDING_DIM) + 'd.txt')
fname = os.path.join(GLOVE_DIR, FILENAME)

with open(fname, "rb") as lines:
    word2vec = {line.split()[0]: np.array(map(float, line.split()[1:]))
               for line in lines}
    
f = open(fname)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
print ('Word2Vec: %s' % len(word2vec))


Found 1193514 word vectors.
Word2Vec: 1193514

3. Implement an embedding vector


In [14]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(word2vec.itervalues().next())
    
    def fit(self, X, y):
        return self 

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(word2vec.itervalues().next())
        
    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, 
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    
        return self
    
    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

4. Create model structure (SVM/ KNN)


In [15]:
cntvect = feature_extraction.text.CountVectorizer(stop_words='english')                                                    
trainX, valX, trainY, valY = train_test_split(texts, labels, test_size=VALIDATION_SPLIT)

In [16]:
svc_glove27B = Pipeline([("count_vectorizer", MeanEmbeddingVectorizer(embeddings_index)), ("linear svc", SVC(kernel="linear"))])
knn_glove27B = Pipeline([("count_vectorizer", MeanEmbeddingVectorizer(embeddings_index)), ("KNN", neighbors.KNeighborsClassifier())])
svc_glove27B_tfidf = Pipeline([("tfidf_vectorizer", TfidfEmbeddingVectorizer(embeddings_index)), ("linear svc", SVC(kernel="linear"))])
knn_glove27B_tfidf = Pipeline([("tfidf_vectorizer", TfidfEmbeddingVectorizer(embeddings_index)), ("KNN", neighbors.KNeighborsClassifier())])

In [17]:
all_models = [
    ("SVM", svc_glove27B),
    ("KNN", knn_glove27B),
    ("SVM-TFIDF", svc_glove27B_tfidf),
    ("KNN-TFIDF", knn_glove27B_tfidf),  
]
scores = sorted([(name, cross_val_score(model, trainX, trainY, cv=5).mean()) 
                 for name, model in all_models], 
                key=lambda (_, x): -x)

print (scores)
#print tabulate(scores, floatfmt=".4f.79055799899004708), ('KNN'", headers=("model", 'score'))


[('SVM-TFIDF', 0.26220998267046558), ('SVM', 0.25804037409959513), ('KNN-TFIDF', 0.25732287998131081), ('KNN', 0.25371472628168207)]

In [18]:
def benchmark(model, X, y):
    skf = KFold(len(X), n_folds=5, shuffle=True, random_state= 1337)
    scores = []
    for train, test in skf:
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        scores.append(accuracy_score(model.fit(X_train, y_train).predict(X_test), y_test))
    return np.mean(scores)

#benchmark(model, texts, labels)

In [19]:
table = []
for name, model in all_models:
    table.append({'model': name, 
                    'accuracy': benchmark(model, texts, labels)})
df = pd.DataFrame(table)
print (df)


   accuracy      model
0  0.264689        SVM
1  0.257188        KNN
2  0.271291  SVM-TFIDF
3  0.266340  KNN-TFIDF

In [ ]:
# classes: 5
#      accuracy      model
# 0  0.562331        SVM
# 1  0.534512        KNN
# 2  0.570139  SVM-TFIDF
# 3  0.533509  KNN-TFIDF

# classes: 10
#       accuracy      model
# 0  0.416827        SVM
# 1  0.417126        KNN
# 2  0.424729  SVM-TFIDF
# 3  0.422528  KNN-TFIDF

# classes: 15
#  accuracy      model
# 0  0.301223        SVM
# 1  0.304939        KNN
# 2  0.305005  SVM-TFIDF
# 3  0.313923  KNN-TFIDF

# classes: 20
# accuracy      model
# 0  0.264689        SVM
# 1  0.257188        KNN
# 2  0.271291  SVM-TFIDF
# 3  0.266340  KNN-TFIDF

In [ ]: