In [ ]:
import tensorflow as tf
import numpy as np
import pandas as pd
import spacy
In [ ]:
filePath = "data/quora_duplicate_questions.tsv"
df = pd.read_csv(filePath,delimiter="\t")
df.question1 = df.question1.fillna("")
df.question1 = df.question1.apply(str.lower)
df.question2 = df.question2.fillna("")
df.question2 = df.question2.apply(str.lower)
Finding the unique words in the dataset to create the vocabulary
In [ ]:
def tokenize(s, nlp):
    # Tokenize a sentence with spaCy and return a list of token strings
    doc = nlp(s)
    tokSen = []
    for word in doc:
        tokSen.append(word.text)
    return tokSen
nlp = spacy.load('en')
uniqueQuestions = df.question1.unique()
tokenizedQns = [tokenize(unicode(sentence,'utf8'),nlp) for sentence in uniqueQuestions]
words = [word for tokWords in tokenizedQns for word in tokWords]
words2 = df.question2.unique()
words2 = [tokenize(unicode(sentence,'utf8'),nlp) for sentence in words2]
words2 = [word for tokWords in words2 for word in tokWords]
words.extend(words2)
Adding PAD as a filler token for normalizing sentence length and UNK for unknown tokens
In [ ]:
words = set(words)
vocabulary = dict(zip(words,range(2,len(words)+2)))
vocabulary['PAD'] = 0
vocabulary['UNK'] = 1
print("Vocabulary Size including PAD and UNK: ",len(vocabulary))
Loading pre-trained word vectors (wiki.en.vec) for the vocabulary; each question is then represented as a list of indices into the vocabulary
In [ ]:
def loadWordVectors(filePath, vocab):
    # Load 300-d vectors for vocabulary words from a fastText .vec file.
    # The first line of the file is a header (vocab size, dimension); its first
    # token is not in our vocabulary, so the `continue` below skips it.
    txt = open(filePath)
    wordVecs = np.zeros((len(vocab), 300), dtype=float)
    for line in txt:
        splitData = line.rstrip().split(" ")
        word = unicode(splitData[0], 'utf8')
        if word not in vocab:
            continue
        wordVecs[vocab[word]] = np.array(splitData[1:], dtype=float)
    txt.close()
    return wordVecs
wordVecSize = 300
wordVecs = loadWordVectors('data/wiki.en.vec', vocabulary)
In [ ]:
# Words not found in the pre-trained file still have all-zero vectors;
# initialise those rows with random values in [-1, 1). Row 0 (PAD) is left at zeros.
for idx in range(1, len(wordVecs)):
    if not np.any(wordVecs[idx]):
        wordVecs[idx] = 2 * np.random.random_sample(wordVecSize) - 1
In [ ]:
def tokenizeAndIndex(sentence):
    # Map a raw question to a list of vocabulary indices, using UNK for out-of-vocabulary tokens
    words = tokenize(unicode(sentence, 'utf8'), nlp)
    retVal = [vocabulary[word] if word in vocabulary else vocabulary['UNK'] for word in words]
    return retVal
df['Q1Indexed'] = df.question1.apply(tokenizeAndIndex)
df['Q2Indexed'] = df.question2.apply(tokenizeAndIndex)
Keep only questions with at most 50 tokens, then pad shorter ones to a fixed length
In [ ]:
seqLength = 50
df = df[df.Q1Indexed.apply(len) <= seqLength]
df = df[df.Q2Indexed.apply(len) <= seqLength]
# Store the true (pre-padding) lengths; they are needed later to build the padding masks
df['Q1Length'] = df.Q1Indexed.apply(len)
df['Q2Length'] = df.Q2Indexed.apply(len)
def normalizeSequenceLength(sequence):
    # Pad each index sequence with PAD up to seqLength
    if len(sequence) < seqLength:
        padding = [vocabulary['PAD'] for i in range(seqLength - len(sequence))]
        sequence.extend(padding)
    return sequence
df.Q1Indexed = df.Q1Indexed.apply(normalizeSequenceLength)
df.Q2Indexed = df.Q2Indexed.apply(normalizeSequenceLength)
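As a quick sanity check (the indices below are hypothetical, not taken from the vocabulary), a two-token sequence is padded with PAD (index 0) up to seqLength:
In [ ]:
# Quick check with made-up indices: a short sequence gets padded to seqLength with PAD (0)
example = normalizeSequenceLength([5, 9])
print("{} {}".format(len(example), example[:5]))   # 50 [5, 9, 0, 0, 0]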
In [ ]:
positiveSamples = df[df.is_duplicate==1]
negativeSamples = df[df.is_duplicate==0]
#Testing data
positiveTest = positiveSamples.sample(frac=0.3)
negativeTest = negativeSamples.sample(frac=0.3)
testData = positiveTest.append(negativeTest)
print("Number of test samples: {0}".format(len(testData)))
#Training data
trainData = df[df.id.isin(testData.id) == False]
print("Number of train samples: {0}".format(len(trainData)))
positiveVal = positiveTest.sample(frac=0.5)
negativeVal = negativeTest.sample(frac=0.5)
valData = positiveVal.append(negativeVal)
positiveTest = positiveTest[positiveTest.id.isin(positiveVal.id) == False]
negativeTest = negativeTest[negativeTest.id.isin(negativeVal.id) == False]
testData = positiveTest.append(negativeTest)
totalLen = float(len(df))
print("Split ratio: {}:{}:{}".format(len(trainData) / totalLen, len(valData) / totalLen, len(testData) / totalLen))
print("Total Samples: {}:{}:{}".format(len(trainData), len(valData), len(testData)))
#print(float(len(valData)) / len(df))
Saving processed data to file
In [ ]:
df.to_pickle('data/ProcessedData.pkl')
trainData.to_pickle("data/TrainData.pkl")
testData.to_pickle("data/TestData.pkl")
valData.to_pickle("data/ValData.pkl")
np.save('data/wordVecs.npy',wordVecs)
Loading processed data from file
In [ ]:
df = pd.read_pickle('data/ProcessedData.pkl')
trainData = pd.read_pickle('data/TrainData.pkl')
testData = pd.read_pickle('data/TestData.pkl')
testData['predicted'] = -1
valData = pd.read_pickle('data/ValData.pkl')
wordVecs = np.load('data/wordVecs.npy')
wordVecSize = 300
seqLength = 50
Creating sentence embeddings for both questions
In [ ]:
tf.reset_default_graph()
In [ ]:
wordVecSize = 100  # dimensionality after the 'downsample' layer; the pre-trained vectors are 300-d
vocab_size = len(wordVecs)
with tf.variable_scope("Words") as scope:
    W = tf.Variable(wordVecs, name="W")
    #W = tf.Variable(tf.random_uniform([vocab_size, wordVecSize], -1.0, 1.0),name="W")
    q1Input = tf.placeholder(tf.int32, [None, seqLength], name="q1Input")
    q1Embeddings = tf.nn.embedding_lookup(W, q1Input)
    q1Embeddings = tf.reshape(q1Embeddings, [-1, 300], name='q1Reshape')
    q1Embeddings = tf.layers.dense(inputs=q1Embeddings, units=wordVecSize, name='downsample')
    q1Embeddings = tf.nn.dropout(x=q1Embeddings, keep_prob=0.9)
    q1Embeddings = tf.reshape(q1Embeddings, [-1, seqLength, wordVecSize])
    q1Mask = tf.placeholder(tf.float64, [None, seqLength, 1], name="q1Mask")
    q1Embeddings = tf.multiply(q1Embeddings, q1Mask, name='q1Masked')
    q1SeqLen = tf.placeholder(tf.int32, [None], name="q1SequenceLength")
    # Reuse the same embedding matrix and downsampling weights for question 2
    scope.reuse_variables()
    q2Input = tf.placeholder(tf.int32, [None, seqLength], name="q2Input")
    #q2SeqLen = tf.placeholder(tf.int32, [None], name="q2SequenceLength")
    q2Embeddings = tf.nn.embedding_lookup(W, q2Input)
    q2Embeddings = tf.reshape(q2Embeddings, [-1, 300], name='q2Reshape')
    q2Embeddings = tf.layers.dense(inputs=q2Embeddings, units=wordVecSize, name='downsample')
    q2Embeddings = tf.nn.dropout(x=q2Embeddings, keep_prob=0.9)
    q2Embeddings = tf.reshape(q2Embeddings, [-1, seqLength, wordVecSize])
    q2Mask = tf.placeholder(tf.float64, [None, seqLength, 1], name="q2Mask")
    q2Embeddings = tf.multiply(q2Embeddings, q2Mask, name='q2Masked')
with tf.variable_scope("Sentence") as scope:
# initializer = tf.contrib.layers.xavier_initializer()
cell = tf.contrib.rnn.GRUCell(wordVecSize)
q1Rep,_ = tf.nn.dynamic_rnn(cell,q1Embeddings,dtype=tf.float64,swap_memory=True)
q1Rep = tf.reduce_mean(q1Rep,axis=1)
#idx = tf.argmax(q1Rep,axis=1)
#q1Rep = q1Rep[:,idx,:]
#q1Rep = q1Rep[:,-1,:]
scope.reuse_variables()
q2Rep,_ = tf.nn.dynamic_rnn(cell,q2Embeddings,dtype=tf.float64,swap_memory=True)
q2Rep = tf.reduce_mean(q2Rep,axis=1)
#idx = tf.argmax(q2Rep,axis=1)
#q2Rep = q2Rep[:,idx,:]
#q2Rep = q2Rep[:,-1,:]
sentenceEmbedding = tf.concat([q1Rep,q2Rep],axis=1,name='sentenceEmbedding')
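A note on the readout above: the mask zeroes the embeddings at PAD positions before the GRU, but tf.reduce_mean still averages over all seqLength timesteps rather than the true question length. The toy NumPy sketch below (not part of the graph; the arrays are made up) illustrates the difference between the two averages:
In [ ]:
# Toy illustration (not part of the TensorFlow graph): averaging a masked,
# padded sequence by the fixed seqLength vs. by the true token count.
toyOutputs = np.ones((4, 3))                  # pretend per-timestep outputs
toyMask = np.array([[1.], [1.], [0.], [0.]])  # 2 real tokens, 2 PAD positions
masked = toyOutputs * toyMask
print(masked.mean(axis=0))                    # divides by 4 -> [0.5 0.5 0.5]
print(masked.sum(axis=0) / toyMask.sum())     # divides by 2 -> [1. 1. 1.]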
Dense layers and output
In [ ]:
with tf.variable_scope("DenseLayers") as scope:
dense1 = tf.layers.dense(inputs=sentenceEmbedding, units=wordVecSize*2, activation=tf.nn.tanh,name='dense1')
#dropoutD1 = tf.nn.dropout(x=dense1,keep_prob=0.9)
dense2 = tf.layers.dense(inputs=dense1, units=wordVecSize*2, activation=tf.nn.tanh,name='dense2')
#dropoutD2 = tf.nn.dropout(x=dense2,keep_prob=0.9)
#dense3 = tf.layers.dense(inputs=dropoutD2, units=wordVecSize*2, activation=tf.nn.tanh,name='dense3')
#dropoutD3 = tf.nn.dropout(x=dense3,keep_prob=0.7)
logits = tf.layers.dense(inputs=dense2, units=2,name='logits')
with tf.variable_scope("Prediction") as scope:
predictions = tf.argmax(input=tf.nn.softmax(logits=logits,dim=-1,name='softmax'),axis=1,name='output')
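Side note: softmax is monotonic, so taking argmax of the logits directly would give the same predictions; the softmax op only matters if probabilities are needed. A small NumPy check with made-up logits:
In [ ]:
# Made-up logits: argmax over logits and over their softmax agree
toyLogits = np.array([[2.0, -1.0], [0.3, 0.9]])
toySoftmax = np.exp(toyLogits) / np.exp(toyLogits).sum(axis=1, keepdims=True)
print("{} {}".format(np.argmax(toyLogits, axis=1), np.argmax(toySoftmax, axis=1)))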
Loss and gradient updates
In [ ]:
num_classes = 2
labels = tf.placeholder(tf.int32,[None,num_classes],name='labels')
loss = None
train_op = None
# Calculate loss for both TRAIN and EVAL modes
loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
train_op = tf.contrib.layers.optimize_loss(loss=loss,
global_step=tf.contrib.framework.get_global_step(),
learning_rate=0.01,
optimizer="Adam")
correct_prediction = tf.equal(predictions, tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
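One possible simplification (a sketch, not what the loops below actually do): the training loops convert labels to one-hot by building a new tf.one_hot op and evaluating it for every batch, which keeps growing the graph. An equivalent NumPy construction avoids that:
In [ ]:
# Sketch of a NumPy one-hot encoder equivalent to the tf.one_hot + eval()
# pattern used in the training loops below (assumes 0/1 integer labels).
def oneHotNumpy(labelValues, depth=2):
    labelValues = np.asarray(labelValues, dtype=np.int32)
    out = np.zeros((len(labelValues), depth), dtype=np.int32)
    out[np.arange(len(labelValues)), labelValues] = 1
    return out
# e.g. oneHotNumpy([0, 1, 1]) -> [[1,0],[0,1],[0,1]]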
Prepare variables for the training loop
In [ ]:
session = tf.InteractiveSession()
In [ ]:
fetches = {'eval_op': train_op, 'accuracy': accuracy}  # optimize_loss returns the loss value, so 'eval_op' doubles as the batch loss
print("Starting...")
session.run(tf.global_variables_initializer())
noEpisodes = 10
batchSize = 1000
noEpochs = len(trainData) // batchSize
valLabels = tf.one_hot(valData.is_duplicate.values, on_value=1,
                       off_value=0, depth=2, axis=-1, name='one_hot_labels')
valLabels = valLabels.eval(session=session)
valQ1Indices = np.array(list(valData.Q1Indexed.values), dtype=np.int32)
valQ1Len = valData.Q1Length.values.astype(np.int32)
valQ1Mask = [np.append(np.ones((revLen, 1)), np.zeros((seqLength - revLen, 1)), axis=0)
             for revLen in valData.Q1Length]
valQ2Indices = np.array(list(valData.Q2Indexed.values), dtype=np.int32)
valQ2Len = valData.Q2Length.values.astype(np.int32)
valQ2Mask = [np.append(np.ones((revLen, 1)), np.zeros((seqLength - revLen, 1)), axis=0)
             for revLen in valData.Q2Length]
testLabels = tf.one_hot(testData.is_duplicate.values, on_value=1,
                        off_value=0, depth=2, axis=-1, name='one_hot_labels')
testLabels = testLabels.eval(session=session)
testQ1Indices = np.array(list(testData.Q1Indexed.values), dtype=np.int32)
testQ1Len = testData.Q1Length.values.astype(np.int32)
testQ1Mask = [np.append(np.ones((revLen, 1)), np.zeros((seqLength - revLen, 1)), axis=0)
              for revLen in testData.Q1Length]
testQ2Indices = np.array(list(testData.Q2Indexed.values), dtype=np.int32)
testQ2Len = testData.Q2Length.values.astype(np.int32)
testQ2Mask = [np.append(np.ones((revLen, 1)), np.zeros((seqLength - revLen, 1)), axis=0)
              for revLen in testData.Q2Length]
noTestBatches = 100
testSzPerBatch = len(valQ1Indices) // noTestBatches
print("Episode\ttrain loss\tval loss\ttest loss\tval accuracy\ttest accuracy")
for episode in range(noEpisodes):
    episodeData = trainData.iloc[np.random.permutation(len(trainData))]
    startIdx = 0
    episodeLoss = 0
    for epoch in range(noEpochs):
        batch = episodeData.iloc[startIdx:startIdx + batchSize]
        startIdx += batchSize
        oneHotLabels = tf.one_hot(batch.is_duplicate.values,
                                  on_value=1, off_value=0, depth=2, axis=-1, name='one_hot_labels')
        oneHotLabels = oneHotLabels.eval(session=session)
        q1Indices = np.array(list(batch.Q1Indexed.values), dtype=np.int32)
        q1Len = batch.Q1Length.values.astype(np.int32)
        q1MaskInp = [np.append(np.ones((revLen, 1)), np.zeros((seqLength - revLen, 1)), axis=0)
                     for revLen in batch.Q1Length]
        q2Indices = np.array(list(batch.Q2Indexed.values), dtype=np.int32)
        q2Len = batch.Q2Length.values.astype(np.int32)
        q2MaskInp = [np.append(np.ones((revLen, 1)), np.zeros((seqLength - revLen, 1)), axis=0)
                     for revLen in batch.Q2Length]
        feed_dict = {q1Input: q1Indices, q2Input: q2Indices, labels: oneHotLabels,
                     q1Mask: q1MaskInp, q2Mask: q2MaskInp}
        trainMetrics = session.run(fetches, feed_dict)
        episodeLoss += trainMetrics['eval_op']
    episodeLoss /= noEpochs
    # Validation
    valLoss = 0
    valAccuracy = 0
    fetches = {'loss': loss, 'accuracy': accuracy}
    for subTest in range(noTestBatches):
        startIdx = subTest * testSzPerBatch
        endIdx = startIdx + testSzPerBatch
        if subTest == noTestBatches - 1:
            endIdx = len(valQ1Indices)
        valFeed = {q1Input: valQ1Indices[startIdx:endIdx],
                   q2Input: valQ2Indices[startIdx:endIdx],
                   labels: valLabels[startIdx:endIdx],
                   q1Mask: valQ1Mask[startIdx:endIdx],
                   q2Mask: valQ2Mask[startIdx:endIdx]}
        valMetrics = session.run(fetches, valFeed)
        valLoss += valMetrics['loss']
        valAccuracy += valMetrics['accuracy']
    # Test
    testLoss = 0
    testAccuracy = 0
    fetches = {'loss': loss, 'accuracy': accuracy, 'predictions': predictions}
    for subTest in range(noTestBatches):
        startIdx = subTest * testSzPerBatch
        endIdx = startIdx + testSzPerBatch
        if subTest == noTestBatches - 1:
            endIdx = len(testQ1Indices)
        testFeed = {q1Input: testQ1Indices[startIdx:endIdx],
                    q2Input: testQ2Indices[startIdx:endIdx],
                    labels: testLabels[startIdx:endIdx],
                    q1Mask: testQ1Mask[startIdx:endIdx],
                    q2Mask: testQ2Mask[startIdx:endIdx]}
        testMetrics = session.run(fetches, testFeed)
        testLoss += testMetrics['loss']
        testAccuracy += testMetrics['accuracy']
        testData.loc[testData.index[startIdx:endIdx], 'predicted'] = testMetrics['predictions']
    valLoss = valLoss / float(noTestBatches)
    valAccuracy = (100.0 / noTestBatches) * valAccuracy
    testLoss = testLoss / float(noTestBatches)
    testAccuracy = (100.0 / noTestBatches) * testAccuracy
    print("{}\t{}\t{}\t{}\t{}\t{}".format(episode, episodeLoss, valLoss, testLoss, valAccuracy, testAccuracy))
    fetches = {'eval_op': train_op, 'accuracy': accuracy}
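The loop above rebuilds the 0/1 padding masks with the same list comprehension in several places; a small helper like the hypothetical buildMask below (not used above) captures that pattern.
In [ ]:
# Hypothetical helper: one (seqLength, 1) column per example, with ones for
# real tokens and zeros for PAD positions.
def buildMask(lengths, maxLen):
    return [np.append(np.ones((l, 1)), np.zeros((maxLen - l, 1)), axis=0)
            for l in lengths]
# e.g. buildMask(valData.Q1Length, seqLength) reproduces valQ1Mask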
In [ ]:
testData.to_csv('testPredictions_lstm.csv')
In [ ]:
# Continue training for 10 more episodes, evaluating on the validation set only.
# The graph above consumes padding masks rather than sequence lengths, so the
# feeds use q1Mask/q2Mask (q2SeqLen is not defined in the graph).
fetches = {'eval_op': train_op, 'accuracy': accuracy}
print("Episode\ttrain loss\tval loss\tval accuracy")
for episode in range(noEpisodes):
    episodeData = trainData.iloc[np.random.permutation(len(trainData))]
    startIdx = 0
    episodeLoss = 0
    for epoch in range(noEpochs):
        batch = episodeData.iloc[startIdx:startIdx + batchSize]
        startIdx += batchSize
        oneHotLabels = tf.one_hot(batch.is_duplicate.values,
                                  on_value=1, off_value=0, depth=2, axis=-1, name='one_hot_labels')
        oneHotLabels = oneHotLabels.eval(session=session)
        q1Indices = np.array(list(batch.Q1Indexed.values), dtype=np.int32)
        q1MaskInp = [np.append(np.ones((revLen, 1)), np.zeros((seqLength - revLen, 1)), axis=0)
                     for revLen in batch.Q1Length]
        q2Indices = np.array(list(batch.Q2Indexed.values), dtype=np.int32)
        q2MaskInp = [np.append(np.ones((revLen, 1)), np.zeros((seqLength - revLen, 1)), axis=0)
                     for revLen in batch.Q2Length]
        feed_dict = {q1Input: q1Indices, q1Mask: q1MaskInp,
                     q2Input: q2Indices, q2Mask: q2MaskInp, labels: oneHotLabels}
        trainMetrics = session.run(fetches, feed_dict)
        episodeLoss += trainMetrics['eval_op']
    episodeLoss /= noEpochs
    valLoss = 0
    valAccuracy = 0
    fetches = {'loss': loss, 'accuracy': accuracy}
    for subTest in range(noTestBatches):
        startIdx = subTest * testSzPerBatch
        endIdx = startIdx + testSzPerBatch
        if subTest == noTestBatches - 1:
            endIdx = len(valQ1Indices)
        valFeed = {q1Input: valQ1Indices[startIdx:endIdx],
                   q1Mask: valQ1Mask[startIdx:endIdx],
                   q2Input: valQ2Indices[startIdx:endIdx],
                   q2Mask: valQ2Mask[startIdx:endIdx],
                   labels: valLabels[startIdx:endIdx]}
        valMetrics = session.run(fetches, valFeed)
        valLoss += valMetrics['loss']
        valAccuracy += valMetrics['accuracy']
    valLoss = valLoss / float(noTestBatches)
    valAccuracy = (100.0 / noTestBatches) * valAccuracy
    print("{}\t{}\t{}\t{}".format(episode + 10, episodeLoss, valLoss, valAccuracy))
    fetches = {'eval_op': train_op, 'accuracy': accuracy}
In [ ]:
Testing restore and predictions
In [ ]:
# Evaluate on the test set in batches; correct_prediction gives a per-example
# boolean, which is combined with the one-hot labels to get the confusion counts.
testLoss = 0
testAccuracy = 0
falsePositives = 0
falseNegatives = 0
truePositives = 0
trueNegatives = 0
fetches = {'loss': loss, 'accuracy': accuracy, 'correct': correct_prediction}
for subTest in range(noTestBatches):
    startIdx = subTest * testSzPerBatch
    endIdx = startIdx + testSzPerBatch
    if subTest == noTestBatches - 1:
        endIdx = len(testQ1Indices)
    testFeed = {q1Input: testQ1Indices[startIdx:endIdx],
                q1Mask: testQ1Mask[startIdx:endIdx],
                q2Input: testQ2Indices[startIdx:endIdx],
                q2Mask: testQ2Mask[startIdx:endIdx],
                labels: testLabels[startIdx:endIdx]}
    testMetrics = session.run(fetches, testFeed)
    testLoss += testMetrics['loss']
    testAccuracy += testMetrics['accuracy']
    corr = testMetrics['correct']
    tl = testLabels[startIdx:endIdx]
    # Column 0 of the one-hot label marks non-duplicates, column 1 marks duplicates
    falsePositives += np.sum(corr[tl[:, 0] == 1] == False)
    falseNegatives += np.sum(corr[tl[:, 1] == 1] == False)
    truePositives += np.sum(corr[tl[:, 1] == 1] == True)
    trueNegatives += np.sum(corr[tl[:, 0] == 1] == True)
testLoss = testLoss / float(noTestBatches)
testAccuracy = (100.0 / noTestBatches) * testAccuracy
print("{}\t{}".format(testLoss, testAccuracy))
In [ ]:
testLoss = 0
testAccuracy = 0
falsePositives = 0
falseNegatives = 0
truePositives = 0
trueNegatives = 0
startIdx = 0
endIdx = 30193  # evaluate the first 30193 test examples in a single run
fetches = {'loss': loss, 'accuracy': accuracy, 'predictions': predictions}
testFeed = {q1Input: testQ1Indices[startIdx:endIdx],
            q1Mask: testQ1Mask[startIdx:endIdx],
            q2Input: testQ2Indices[startIdx:endIdx],
            q2Mask: testQ2Mask[startIdx:endIdx],
            labels: testLabels[startIdx:endIdx]}
testMetrics = session.run(fetches, testFeed)
testLoss = testMetrics['loss']
testAccuracy = testMetrics['accuracy']
preds = testMetrics['predictions']
tl = testLabels[startIdx:endIdx]
# preds holds predicted class indices (0 = non-duplicate, 1 = duplicate)
falsePositives = np.sum(preds[tl[:, 0] == 1] == 1)
falseNegatives = np.sum(preds[tl[:, 1] == 1] == 0)
truePositives = np.sum(preds[tl[:, 1] == 1] == 1)
trueNegatives = np.sum(preds[tl[:, 0] == 1] == 0)
testAccuracy = 100.0 * testAccuracy
print("{}\t{}".format(testLoss, testAccuracy))
In [ ]:
precision = 100*float(truePositives) / (truePositives + falsePositives)
recall = 100*float(truePositives) / (truePositives + falseNegatives)
print("Precision: {}\tRecall: {}".format(precision, recall))
In [ ]:
print("TP: {}\tTN: {}\tFP: {}\tFN: {}".format(truePositives, trueNegatives, falsePositives, falseNegatives))
In [ ]:
# Sanity check on the one-hot test labels: count non-duplicates ([1,0]) vs
# duplicates ([0,1]) in the first 1000 examples.
print(np.sum(np.all(testLabels[0:1000] == [1, 0], axis=1)))
print(np.sum(np.all(testLabels[0:1000] == [0, 1], axis=1)))
print(len(testLabels[0:1000]))
#S1: sum of testLabels[:,0] will give number of actual non duplicates
#S2: sum of corrPred[:,0] will give number of predicted non duplicates
#S2 - S1
temp = testLabels[startIdx:endIdx]
#temp[temp[:,0] == 1]
In [ ]:
with tf.Session() as sess:
    saver = tf.train.import_meta_graph('/home/ubuntu/QuestionPairs/SumModel/-9.meta')
    saver.restore(sess, '/home/ubuntu/QuestionPairs/SumModel/-9')
    # Reuse the test feed built above to get predictions from the restored model
    temp = predictions.eval(session=sess, feed_dict=testFeed)
In [ ]:
actual = np.argmax(testLabels[:len(temp)], axis=1)
predicted = temp
In [ ]:
y = actual - predicted
print("Fraction of non-duplicates classified as duplicates: {}".format(float(len(y[y == -1])) / float(len(y))))
print("Fraction of duplicates classified as non-duplicates: {}".format(float(len(y[y == 1])) / float(len(y))))