In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import logging, csv, string, sys
import random
#need to use ccri fork of gensim
import gensim.models.word2vec as w2v
import gensim.models.doc2vec as d2v
import numpy as np
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import os
#os.environ['THEANO_FLAGS'] = 'floatX=float32, device=cpu'
os.environ['THEANO_FLAGS'] = 'cuda.root=/usr/local/cuda-6.5,device=gpu,optimizer_including=cudnn,floatX=float32'
import theano.tensor as T
import dcnn
import sklearn.metrics as sklm
from tsne import bh_sne
The sentences were pre-parsed into dependency trees with parts of speech for use with another type of neural net:
http://nlp.stanford.edu/~socherr/SocherKarpathyLeManningNg_TACL2013.pdf
This CNN ignores that pre-parsed sentence structure.
In [2]:
baseDir = "sentences_by_cat/"
# check out service/scripts/depParsing.scala to see how to make the files
files = os.listdir(baseDir)
files = [x for x in files if "parsed" in x]
def lineToList(line):
    # parse a Stanford dependency line of the form "tag(parent-i, child-j)"
    # into [child word, child index, parent index, dependency tag]
    if line != "":
        try:
            split = line.strip(")").split("(")
            tag = split[0]
            tupParts = split[1].split()
            parent = tupParts[0].split('-')[-1].strip(',')
            childParts = tupParts[1].split("-")
            idx = childParts[-1]
            # re-join hyphenated words that the index split broke apart
            word = "-".encode("utf8").join(childParts[:-1])
            return [word, int(idx), int(parent), tag]
        except:
            print "failed on line " + line

def createTupleForFile(fn):
    # sentences are separated by blank lines, one dependency per line
    f = open(baseDir + fn, "r")
    bigassString = f.read()
    f.close()
    return (fn, [map(lineToList, s.split("\n")) for s in bigassString.split("\n\n")])
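For illustration, here is a made-up Stanford-dependency line (not from the data) and what lineToList returns for it:
lineToList("nsubj(ran-2, dog-1)")
# -> ['dog', 1, 2, 'nsubj']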
In [3]:
grouped_by_file = dict([createTupleForFile(x) for x in files])
# parsing errored on the last entry of each file, so drop it
grouped_by_file = dict([(x, y[:-1]) for x, y in grouped_by_file.iteritems()])
In [4]:
just_word_sentences = [map(lambda x: x[0], sentence) for f in grouped_by_file.values() for sentence in f]
In [5]:
tag_to_num = dict([(y,x) for x,y in enumerate(grouped_by_file.keys())])
In [6]:
labeled_sentences = [(tag_to_num.get(tag), sentence) for tag,lst in grouped_by_file.items() for sentence in lst]
Replace any numbers and punctuation with place-holder words.
In [7]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def replaceWord(word):
    if all(char in string.punctuation for char in word):
        return "*PUNCT*".encode("utf8")
    elif is_number(word):
        return "*NUM*".encode("utf8")
    else:
        return word

def replaceStrangeWords(listOfWords):
    return map(replaceWord, listOfWords)
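A quick check on a made-up token list:
replaceStrangeWords(["The", "cat", ",", "9", "lives"])
# -> ['The', 'cat', '*PUNCT*', '*NUM*', 'lives']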
Start the word vectors from pre-trained word2vec vectors. Not strictly necessary, but it speeds up training.
In [8]:
sentences = map(replaceStrangeWords, just_word_sentences)
# the `initial` kwarg (CCRI fork) seeds the vectors from a pre-trained file
seededmodel = w2v.Word2Vec(initial='/opt/devel/src/word2vec/vectors50.txt', size=50, min_count=1)
seededmodel.build_vocab(sentences)
# a few extra training passes over the corpus
seededmodel.train(sentences)
seededmodel.train(sentences)
seededmodel.train(sentences)
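The initial kwarg only exists in the CCRI fork. With stock gensim of this vintage, a rough equivalent (a sketch, not the fork's actual code) would copy the pre-trained rows in by hand after build_vocab:
pretrained = w2v.Word2Vec.load_word2vec_format('/opt/devel/src/word2vec/vectors50.txt', binary=False)
for word, entry in seededmodel.vocab.iteritems():
    if word in pretrained.vocab:
        seededmodel.syn0[entry.index] = pretrained[word]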
In [9]:
def getIndexForWord(word):
    if word in seededmodel.vocab:
        return seededmodel.vocab[word].index
    else:
        print "couldn't find: " + word
        return -1

lab_sen = [([getIndexForWord(replaceWord(tup[0])) for tup in sent if tup is not None],
            dcnn.one_of_n(t, 5))
           for t, sent in labeled_sentences]
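dcnn.one_of_n isn't shown here; presumably it builds a one-hot label vector over the 5 categories, along these lines:
def one_of_n_sketch(i, n):
    # hypothetical stand-in for dcnn.one_of_n: one-hot vector of length n
    vec = np.zeros(n)
    vec[i] = 1.0
    return vec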
In [10]:
random.seed(5)
random.shuffle(lab_sen)
trainFrac = .7
validFrac = .1
t_frac = int(len(lab_sen) * trainFrac)                # end index of the train split
v_frac = int(len(lab_sen) * (trainFrac + validFrac))  # end index of the validation split
train = lab_sen[:t_frac]
valid = lab_sen[t_frac:v_frac]
print seededmodel.syn0.shape
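# append a zero row so the -1 index returned for OOV words maps to a zero vector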
voc_with_zero_vec = np.vstack((seededmodel.syn0, np.zeros(50)))
print voc_with_zero_vec.shape
Train on 70% of the data, validate on 10%, and hold out the remaining 20% for testing.
In [11]:
nn = dcnn.DCNN(voc_with_zero_vec, [0,0,1], [5,3,5], [5,3,2], train, valid, k_top=3, hidden_dim=100)
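dcnn.DCNN appears to follow Kalchbrenner et al.'s dynamic convolutional network, whose distinctive operation is k-max pooling: keep each feature row's k largest activations in their original left-to-right order (k_top=3 sets k at the top layer). A minimal numpy sketch of that assumption:
def k_max_pool_sketch(mat, k):
    # mat: (n_features, sentence_length); keep each row's k largest
    # activations, preserving their original order
    idx = np.argsort(mat, axis=1)[:, -k:]
    idx.sort(axis=1)
    return mat[np.arange(mat.shape[0])[:, None], idx]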
In [12]:
# prev = nn.theta.get_value()
# prevh = nn.hidden_layer.get_value()
nn.train_cross_ent(20,5)
# print prev - nn.theta.get_value()
# print prevh - nn.hidden_layer.get_value()
After the primary training completes, train for a few more epochs using the bootstrap-soft objective function.
In [13]:
nn.train_multi_b_soft(.9, 3, 5)
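Assuming train_multi_b_soft implements Reed et al.'s soft bootstrapping ("Training Deep Neural Networks on Noisy Labels with Bootstrapping"), the per-example loss blends the given one-hot label with the network's own prediction (beta=.9 above) before taking the cross-entropy. A sketch of that objective:
def bootstrap_soft_sketch(q, t, beta=0.9):
    # q: predicted class probabilities, t: one-hot label (length 5 here);
    # blend the label with the prediction, then take cross-entropy
    target = beta * t + (1.0 - beta) * q
    return -np.sum(target * np.log(q))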
In [14]:
testX = map(lambda x: x[0], lab_sen[v_frac:])
testY = map(lambda x: np.argmax(x[1]), lab_sen[v_frac:])
In [15]:
preds = np.array(nn.predict(testX))
Look at the F1 scores and accuracy as well as the confusion matrix.
In [16]:
singleX = map(np.argmax, preds)
print len(preds)
print sklm.f1_score(testY, singleX, average='macro')
print sklm.f1_score(testY, singleX, average='micro')
print sklm.accuracy_score(testY, singleX)
print tag_to_num
conf = sklm.confusion_matrix(testY, singleX)
print conf
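# fraction of test examples per true class (normalized confusion-matrix row sums)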
print np.sum(conf, axis=1) / (np.sum(conf) * 1.0)
In [17]:
allX = map(lambda x: x[0], lab_sen)
allY = map(lambda x: np.argmax(x[1]), lab_sen)
allpreds = np.array(nn.predict(allX))
allembeds = np.array(nn.embed(allX))
In [34]:
all_single_X = map(np.argmax, allpreds)
print len(allpreds)
print sklm.f1_score(allY, all_single_X, average='macro')
print sklm.f1_score(allY, all_single_X, average='micro')
print sklm.accuracy_score(allY, all_single_X)
print tag_to_num
conf_all = sklm.confusion_matrix(allY, all_single_X)
print conf_all
print np.sum(conf_all, axis=1) / (np.sum(conf_all) * 1.0)
In [19]:
reduced = bh_sne(np.array(allembeds, dtype='float64'))
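bh_sne wants float64 input, which is why the embeddings are cast explicitly above.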
Look at the embeddings from the fully connected layer before the softmax.
First colored by their ground-truth classification.
In [42]:
plt.figure(figsize=(15,12))
plt.scatter(reduced[:, 0], reduced[:, 1], c=allY, cmap=plt.cm.rainbow, alpha=.75)
plt.colorbar()
plt.title("True Category")
plt.savefig("true_embed.png")
Then colored by the predicted classification.
In [41]:
plt.figure(figsize=(15,12))
plt.scatter(reduced[:, 0], reduced[:, 1], c=map(np.argmax, allpreds), cmap=plt.cm.rainbow, alpha=.75)
plt.colorbar()
plt.title("Predicted Category")
plt.savefig("predicted_embed.png")
In [26]:
docsentences = [(d2v.LabeledSentence(replaceStrangeWords([x[0] for x in s[1]]), ['id-' + str(i)]), s[0])
                for i, s in enumerate(labeled_sentences)]
In [27]:
print docsentences[0][0]
d2vmodel = d2v.Doc2Vec(initial = '/opt/devel/src/word2vec/vectors50.txt', size = 50, min_count=1)
d2vmodel.build_vocab([x[0] for x in docsentences])
d2vmodel.train([x[0] for x in docsentences])
random.shuffle(docsentences)
d2vmodel.train([x[0] for x in docsentences])
random.shuffle(docsentences)
d2vmodel.train([x[0] for x in docsentences])
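Reshuffling the sentences between passes was a commonly recommended trick for doc2vec training in this era of gensim.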
In [28]:
vecs = [(d2vmodel[x.labels[0]], y) for x, y in docsentences]
random.seed(5)
random.shuffle(vecs)
lr_train = vecs[:t_frac]
In [32]:
from sklearn import linear_model
logreg = linear_model.LogisticRegression(C=1e5)
X, y = zip(*lr_train)
logreg.fit(X,y)
lr_testX, lr_testY = zip(*vecs)
lr_preds = logreg.predict(lr_testX)
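Note that lr_preds covers every sentence, including the 70% the logistic regression was fit on, so the scores below are optimistic in the same way as the all-data DCNN numbers above.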
In [33]:
print len(lr_preds)
print sklm.f1_score(lr_testY, lr_preds, average='macro')
print sklm.f1_score(lr_testY, lr_preds, average='micro')
print sklm.accuracy_score(lr_testY, lr_preds)
print tag_to_num
print sklm.confusion_matrix(lr_testY, lr_preds)
In [35]:
d2v_reduced = bh_sne(np.array(map(lambda x: x[0], vecs), dtype='float64'))
In [39]:
plt.figure(figsize=(15,12))
plt.scatter(d2v_reduced[:, 0], d2v_reduced[:, 1], c=lr_testY, cmap=plt.cm.rainbow, alpha=.75)
plt.colorbar()
plt.title("True Category")
plt.savefig("true_embed_d2v.png")
In [40]:
plt.figure(figsize=(15,12))
plt.scatter(d2v_reduced[:, 0], d2v_reduced[:, 1], c=lr_preds, cmap=plt.cm.rainbow, alpha=.75)
plt.colorbar()
plt.title("Predicted Category")
plt.savefig("predicted_embed_d2v.png")