In [1]:
from __future__ import print_function
import os
import time
import sys
#from tabulate import tabulate
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from sklearn import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score, train_test_split, KFold
#from gensim.models.word2vec import Word2Vec
np.random.seed(1337)
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1
DROP_OUT = 0.3
Nb_EPOCH = 30
BATCH_SIZE = 50
Classes = 180
GLOVE_DIR = './glove.6B/'
FILENAME = 'glove.6B.' + str(EMBEDDING_DIM) + 'd.txt'
TEXT_DATA_DIR = './yahoo_' + str(Classes)
In [2]:
texts = [] # list of text samples
labels_index = {} # dictionary mapping label name to numeric id
labels = [] # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
path = os.path.join(TEXT_DATA_DIR, name)
if os.path.isdir(path):
label_id = len(labels_index)
labels_index[name] = label_id
for fname in sorted(os.listdir(path)):
if fname.isdigit():
fpath = os.path.join(path, fname)
if sys.version_info < (3,):
f = open(fpath)
else:
f = open(fpath, encoding='latin-1')
texts.append(f.read())
f.close()
labels.append(label_id)
print('Found %s texts.' % len(texts))
texts, labels = np.array(texts), np.array(labels)
print (texts.shape, labels.shape)
In [ ]:
embeddings_index = {}
# fname = os.path.join(GLOVE_DIR, 'glove.twitter.27B.' + str(EMBEDDING_DIM) + 'd.txt')
fname = os.path.join(GLOVE_DIR, FILENAME)
with open(fname, "rb") as lines:
word2vec = {line.split()[0]: np.array(map(float, line.split()[1:]))
for line in lines}
f = open(fname)
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))
print ('Word2Vec: %s' % len(word2vec))
In [ ]:
class MeanEmbeddingVectorizer(object):
def __init__(self, word2vec):
self.word2vec = word2vec
self.dim = len(word2vec.itervalues().next())
def fit(self, X, y):
return self
def transform(self, X):
return np.array([
np.mean([self.word2vec[w] for w in words if w in self.word2vec]
or [np.zeros(self.dim)], axis=0)
for words in X
])
class TfidfEmbeddingVectorizer(object):
def __init__(self, word2vec):
self.word2vec = word2vec
self.word2weight = None
self.dim = len(word2vec.itervalues().next())
def fit(self, X, y):
tfidf = TfidfVectorizer(analyzer=lambda x: x)
tfidf.fit(X)
# if a word was never seen - it must be at least as infrequent
# as any of the known words - so the default idf is the max of
# known idf's
max_idf = max(tfidf.idf_)
self.word2weight = defaultdict(
lambda: max_idf,
[(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
return self
def transform(self, X):
return np.array([
np.mean([self.word2vec[w] * self.word2weight[w]
for w in words if w in self.word2vec] or
[np.zeros(self.dim)], axis=0)
for words in X
])
In [ ]:
cntvect = feature_extraction.text.CountVectorizer(stop_words='english')
trainX, valX, trainY, valY = train_test_split(texts, labels, test_size=VALIDATION_SPLIT)
In [ ]:
svc_glove27B = Pipeline([("count_vectorizer", MeanEmbeddingVectorizer(embeddings_index)), ("linear svc", SVC(kernel="linear"))])
knn_glove27B = Pipeline([("count_vectorizer", MeanEmbeddingVectorizer(embeddings_index)), ("KNN", neighbors.KNeighborsClassifier())])
svc_glove27B_tfidf = Pipeline([("tfidf_vectorizer", TfidfEmbeddingVectorizer(embeddings_index)), ("linear svc", SVC(kernel="linear"))])
knn_glove27B_tfidf = Pipeline([("tfidf_vectorizer", TfidfEmbeddingVectorizer(embeddings_index)), ("KNN", neighbors.KNeighborsClassifier())])
In [ ]:
all_models = [
("SVM", svc_glove27B),
("KNN", knn_glove27B),
("SVM-TFIDF", svc_glove27B_tfidf),
("KNN-TFIDF", knn_glove27B_tfidf),
]
scores = sorted([(name, cross_val_score(model, trainX, trainY, cv=5).mean())
for name, model in all_models],
key=lambda (_, x): -x)
print (scores)
#print tabulate(scores, floatfmt=".4f.79055799899004708), ('KNN'", headers=("model", 'score'))
In [ ]:
def benchmark(model, X, y):
skf = KFold(len(X), n_folds=5, shuffle=True, random_state= 1337)
scores = []
for train, test in skf:
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]
scores.append(accuracy_score(model.fit(X_train, y_train).predict(X_test), y_test))
return np.mean(scores)
#benchmark(model, texts, labels)
In [ ]:
table = []
start = time.time()
for name, model in all_models:
table.append({'model': name,
'accuracy': benchmark(model, texts, labels)})
print ("Total run time: ", time.time()-start)
df = pd.DataFrame(table)
print (df)
In [ ]:
In [ ]:
accuracy model 0 0.765992 SVM 1 0.752564 KNN 2 0.768358 SVM-TFIDF 3 0.758482 KNN-TFIDF
accuracy model 0 0.596764 SVM 1 0.622618 KNN 2 0.603283 SVM-TFIDF 3 0.624864 KNN-TFIDF
Total run time: 918.678102016 accuracy model 0 0.467512 SVM 1 0.467318 KNN 2 0.471844 SVM-TFIDF 3 0.475389 KNN-TFIDF
Total run time: 1777.46248698 accuracy model 0 0.394013 SVM 1 0.429211 KNN 2 0.403931 SVM-TFIDF 3 0.428364 KNN-TFIDF
Total run time: 12126.748764 accuracy model 0 0.471411 SVM 1 0.483445 KNN 2 0.476590 SVM-TFIDF 3 0.483315 KNN-TFIDF
Total run time: 4297.12801099 accuracy model 0 0.336113 SVM 1 0.336112 KNN 2 0.340016 SVM-TFIDF 3 0.335994 KNN-TFIDF
Total run time: 42448.8118892 accuracy model 0 0.294859 SVM 1 0.290941 KNN 2 0.298249 SVM-TFIDF 3 0.292060 KNN-TFIDF
In [ ]: