In [11]:
from __future__ import print_function
import os
import time
import sys
#from tabulate import tabulate
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from sklearn import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score, train_test_split, KFold
#from gensim.models.word2vec import Word2Vec
np.random.seed(1337)
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1
DROP_OUT = 0.3
Nb_EPOCH = 10
BATCH_SIZE = 10
Classes = 20
BASE_DIR = '.'
GLOVE_DIR = BASE_DIR + '/glove.twitter.27B/'
FILENAME = 'glove.twitter.27B.' + str(EMBEDDING_DIM) + 'd.txt'
TEXT_DATA_DIR = BASE_DIR + '/20_newsgroups_'+ str(Classes)
In [12]:
texts = [] # list of text samples
labels_index = {} # dictionary mapping label name to numeric id
labels = [] # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
path = os.path.join(TEXT_DATA_DIR, name)
if os.path.isdir(path):
label_id = len(labels_index)
labels_index[name] = label_id
for fname in sorted(os.listdir(path)):
if fname.isdigit():
fpath = os.path.join(path, fname)
if sys.version_info < (3,):
f = open(fpath)
else:
f = open(fpath, encoding='latin-1')
texts.append(f.read())
f.close()
labels.append(label_id)
print('Found %s texts.' % len(texts))
texts, labels = np.array(texts), np.array(labels)
print (texts.shape, labels.shape)
In [13]:
embeddings_index = {}
# fname = os.path.join(GLOVE_DIR, 'glove.twitter.27B.' + str(EMBEDDING_DIM) + 'd.txt')
fname = os.path.join(GLOVE_DIR, FILENAME)
with open(fname, "rb") as lines:
word2vec = {line.split()[0]: np.array(map(float, line.split()[1:]))
for line in lines}
f = open(fname)
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))
print ('Word2Vec: %s' % len(word2vec))
In [14]:
class MeanEmbeddingVectorizer(object):
def __init__(self, word2vec):
self.word2vec = word2vec
self.dim = len(word2vec.itervalues().next())
def fit(self, X, y):
return self
def transform(self, X):
return np.array([
np.mean([self.word2vec[w] for w in words if w in self.word2vec]
or [np.zeros(self.dim)], axis=0)
for words in X
])
class TfidfEmbeddingVectorizer(object):
def __init__(self, word2vec):
self.word2vec = word2vec
self.word2weight = None
self.dim = len(word2vec.itervalues().next())
def fit(self, X, y):
tfidf = TfidfVectorizer(analyzer=lambda x: x)
tfidf.fit(X)
# if a word was never seen - it must be at least as infrequent
# as any of the known words - so the default idf is the max of
# known idf's
max_idf = max(tfidf.idf_)
self.word2weight = defaultdict(
lambda: max_idf,
[(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
return self
def transform(self, X):
return np.array([
np.mean([self.word2vec[w] * self.word2weight[w]
for w in words if w in self.word2vec] or
[np.zeros(self.dim)], axis=0)
for words in X
])
In [15]:
cntvect = feature_extraction.text.CountVectorizer(stop_words='english')
trainX, valX, trainY, valY = train_test_split(texts, labels, test_size=VALIDATION_SPLIT)
In [16]:
svc_glove27B = Pipeline([("count_vectorizer", MeanEmbeddingVectorizer(embeddings_index)), ("linear svc", SVC(kernel="linear"))])
knn_glove27B = Pipeline([("count_vectorizer", MeanEmbeddingVectorizer(embeddings_index)), ("KNN", neighbors.KNeighborsClassifier())])
svc_glove27B_tfidf = Pipeline([("tfidf_vectorizer", TfidfEmbeddingVectorizer(embeddings_index)), ("linear svc", SVC(kernel="linear"))])
knn_glove27B_tfidf = Pipeline([("tfidf_vectorizer", TfidfEmbeddingVectorizer(embeddings_index)), ("KNN", neighbors.KNeighborsClassifier())])
In [17]:
all_models = [
("SVM", svc_glove27B),
("KNN", knn_glove27B),
("SVM-TFIDF", svc_glove27B_tfidf),
("KNN-TFIDF", knn_glove27B_tfidf),
]
scores = sorted([(name, cross_val_score(model, trainX, trainY, cv=5).mean())
for name, model in all_models],
key=lambda (_, x): -x)
print (scores)
#print tabulate(scores, floatfmt=".4f.79055799899004708), ('KNN'", headers=("model", 'score'))
In [18]:
def benchmark(model, X, y):
skf = KFold(len(X), n_folds=5, shuffle=True, random_state= 1337)
scores = []
for train, test in skf:
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]
scores.append(accuracy_score(model.fit(X_train, y_train).predict(X_test), y_test))
return np.mean(scores)
#benchmark(model, texts, labels)
In [19]:
table = []
for name, model in all_models:
table.append({'model': name,
'accuracy': benchmark(model, texts, labels)})
df = pd.DataFrame(table)
print (df)
In [ ]:
# classes: 5
# accuracy model
# 0 0.562331 SVM
# 1 0.534512 KNN
# 2 0.570139 SVM-TFIDF
# 3 0.533509 KNN-TFIDF
# classes: 10
# accuracy model
# 0 0.416827 SVM
# 1 0.417126 KNN
# 2 0.424729 SVM-TFIDF
# 3 0.422528 KNN-TFIDF
# classes: 15
# accuracy model
# 0 0.301223 SVM
# 1 0.304939 KNN
# 2 0.305005 SVM-TFIDF
# 3 0.313923 KNN-TFIDF
# classes: 20
# accuracy model
# 0 0.264689 SVM
# 1 0.257188 KNN
# 2 0.271291 SVM-TFIDF
# 3 0.266340 KNN-TFIDF
In [ ]: