SVMs / KNNs on the Yahoo datasets

Setup


In [1]:
from __future__ import print_function
import os
import time
import sys

#from tabulate import tabulate
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from sklearn import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score, train_test_split, KFold
#from gensim.models.word2vec import Word2Vec 

np.random.seed(1337)

# Hyper-parameters kept in one place; only EMBEDDING_DIM, VALIDATION_SPLIT
# and Classes are actually used in this notebook.
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1
DROP_OUT = 0.3
Nb_EPOCH = 30
BATCH_SIZE = 50
Classes = 180

GLOVE_DIR = './glove.6B/'
FILENAME = 'glove.6B.' + str(EMBEDDING_DIM) + 'd.txt'
TEXT_DATA_DIR = './yahoo_' + str(Classes)


/home/irisliu/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/irisliu/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)
/home/irisliu/anaconda2/lib/python2.7/site-packages/sklearn/lda.py:6: DeprecationWarning: lda.LDA has been moved to discriminant_analysis.LinearDiscriminantAnalysis in 0.17 and will be removed in 0.19
  "in 0.17 and will be removed in 0.19", DeprecationWarning)
/home/irisliu/anaconda2/lib/python2.7/site-packages/sklearn/learning_curve.py:23: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the functions are moved. This module will be removed in 0.20
  DeprecationWarning)
/home/irisliu/anaconda2/lib/python2.7/site-packages/sklearn/qda.py:6: DeprecationWarning: qda.QDA has been moved to discriminant_analysis.QuadraticDiscriminantAnalysis in 0.17 and will be removed in 0.19.
  "in 0.17 and will be removed in 0.19.", DeprecationWarning)

1. Load Data


In [2]:
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        # each sub-directory is one class; its name is mapped to a numeric label id
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            # sample files are named with numeric ids
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                texts.append(f.read())
                f.close()
                labels.append(label_id)
print('Found %s texts.' % len(texts))
texts, labels = np.array(texts), np.array(labels)
print (texts.shape, labels.shape)


Found 112500 texts.
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-2-02731b933479> in <module>()
     18                 labels.append(label_id)
     19 print('Found %s texts.' % len(texts))
---> 20 texts, labels = np.array(texts), np.array(labels)
     21 print (texts.shape, labels.shape)

MemoryError: 
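
The MemoryError comes from converting 112,500 full documents into a NumPy string array: np.array allocates a fixed-width string slot per element, sized to the longest document. A lower-memory alternative (a sketch, not part of the original run) is an object-dtype array, which stores only references and still supports the fancy indexing used by benchmark() below:

# Hypothetical fix: object dtype avoids the fixed-width per-document string copy.
texts = np.asarray(texts, dtype=object)
labels = np.asarray(labels, dtype=np.int32)
print(texts.shape, labels.shape)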

2. Prepare word embeddings


In [ ]:
embeddings_index = {}
# fname = os.path.join(GLOVE_DIR, 'glove.twitter.27B.' + str(EMBEDDING_DIM) + 'd.txt')
fname = os.path.join(GLOVE_DIR, FILENAME)

# word2vec and embeddings_index end up holding the same GloVe vectors
# (float64 vs. float32); only embeddings_index is fed to the pipelines below.
# list() keeps the comprehension working under Python 3, where map() is lazy.
with open(fname, "rb") as lines:
    word2vec = {line.split()[0]: np.array(list(map(float, line.split()[1:])))
                for line in lines}

f = open(fname)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
print ('Word2Vec: %s' % len(word2vec))
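
A quick sanity check (a sketch; it assumes the common token 'the' appears in the GloVe vocabulary, which holds for the 6B files) is that the loaded vectors have EMBEDDING_DIM entries:

# Hypothetical check that the embedding dimensionality matches EMBEDDING_DIM.
assert len(embeddings_index['the']) == EMBEDDING_DIM
print('the ->', embeddings_index['the'][:5])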

3. Implement embedding vectorizers


In [ ]:
class MeanEmbeddingVectorizer(object):
    """Represents each document as the mean of the GloVe vectors of its known tokens."""
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # next(iter(...)) works under Python 2 and 3 (itervalues() is Python-2 only)
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    
class TfidfEmbeddingVectorizer(object):
    """Like MeanEmbeddingVectorizer, but weights each token's vector by its IDF."""
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y):
        # identity analyzer: X is expected to be an iterable of token lists
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, 
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    
        return self
    
    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])
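
As a quick illustration of the interface (a toy example with a made-up two-dimensional vocabulary, not the GloVe vectors loaded above), each tokenized document is mapped to a single dense vector and unknown tokens are simply skipped:

# Toy example with a hypothetical 2-d vocabulary.
toy_w2v = {'cat': np.array([1.0, 0.0]), 'dog': np.array([0.0, 1.0])}
toy_docs = [['cat', 'dog'], ['cat', 'unseen']]
print(MeanEmbeddingVectorizer(toy_w2v).fit(toy_docs, None).transform(toy_docs))
# -> rows [0.5, 0.5] and [1.0, 0.0]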

4. Create model structures (SVM / KNN)


In [ ]:
# Bag-of-words baseline vectorizer; defined here but not used in the pipelines below
cntvect = feature_extraction.text.CountVectorizer(stop_words='english')
trainX, valX, trainY, valY = train_test_split(texts, labels, test_size=VALIDATION_SPLIT)

In [ ]:
svc_glove27B = Pipeline([("count_vectorizer", MeanEmbeddingVectorizer(embeddings_index)), ("linear svc", SVC(kernel="linear"))])
knn_glove27B = Pipeline([("count_vectorizer", MeanEmbeddingVectorizer(embeddings_index)), ("KNN", neighbors.KNeighborsClassifier())])
svc_glove27B_tfidf = Pipeline([("tfidf_vectorizer", TfidfEmbeddingVectorizer(embeddings_index)), ("linear svc", SVC(kernel="linear"))])
knn_glove27B_tfidf = Pipeline([("tfidf_vectorizer", TfidfEmbeddingVectorizer(embeddings_index)), ("KNN", neighbors.KNeighborsClassifier())])
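
Both embedding vectorizers iterate over each sample with `for w in words`, so they expect every document to already be a sequence of tokens; passing raw strings would iterate over single characters. If `trainX` holds raw strings, a minimal whitespace tokenization step (a sketch, assuming simple splitting is acceptable; the variable names are hypothetical) could be inserted before fitting:

# Hypothetical pre-tokenization; assumes whitespace splitting is good enough.
trainX_tok = [doc.split() for doc in trainX]
valX_tok = [doc.split() for doc in valX]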

In [ ]:
all_models = [
    ("SVM", svc_glove27B),
    ("KNN", knn_glove27B),
    ("SVM-TFIDF", svc_glove27B_tfidf),
    ("KNN-TFIDF", knn_glove27B_tfidf),  
]
scores = sorted([(name, cross_val_score(model, trainX, trainY, cv=5).mean())
                 for name, model in all_models],
                key=lambda name_score: -name_score[1])  # best mean CV accuracy first

print (scores)
# print(tabulate(scores, floatfmt=".4f", headers=("model", "score")))

In [ ]:
def benchmark(model, X, y):
    # manual 5-fold cross-validation so per-fold accuracies are computed explicitly
    skf = KFold(len(X), n_folds=5, shuffle=True, random_state=1337)
    scores = []
    for train, test in skf:
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        scores.append(accuracy_score(model.fit(X_train, y_train).predict(X_test), y_test))
    return np.mean(scores)

#benchmark(model, texts, labels)

In [ ]:
table = []
start = time.time()
for name, model in all_models:
    table.append({'model': name, 
                    'accuracy': benchmark(model, texts, labels)})
print ("Total run time: ", time.time()-start)
df = pd.DataFrame(table)
print (df)

classes: 5

   accuracy      model
0  0.765992        SVM
1  0.752564        KNN
2  0.768358  SVM-TFIDF
3  0.758482  KNN-TFIDF

classes: 10

   accuracy      model
0  0.596764        SVM
1  0.622618        KNN
2  0.603283  SVM-TFIDF
3  0.624864  KNN-TFIDF

classes: 20

Total run time: 918.678102016

   accuracy      model
0  0.467512        SVM
1  0.467318        KNN
2  0.471844  SVM-TFIDF
3  0.475389  KNN-TFIDF

classes: 30

Total run time: 1777.46248698

   accuracy      model
0  0.394013        SVM
1  0.429211        KNN
2  0.403931  SVM-TFIDF
3  0.428364  KNN-TFIDF

classes: 40

Total run time: 12126.748764

   accuracy      model
0  0.471411        SVM
1  0.483445        KNN
2  0.476590  SVM-TFIDF
3  0.483315  KNN-TFIDF

classes: 60

Total run time: 4297.12801099

   accuracy      model
0  0.336113        SVM
1  0.336112        KNN
2  0.340016  SVM-TFIDF
3  0.335994  KNN-TFIDF

classes: 120

Total run time: 42448.8118892

   accuracy      model
0  0.294859        SVM
1  0.290941        KNN
2  0.298249  SVM-TFIDF
3  0.292060  KNN-TFIDF

classes: 180

classes: 280

