Finetuning on Reddit data


In [1]:
import sys, os
# Make the project root importable so that the src.* modules resolve.
sys.path.insert(0, os.path.abspath(os.pardir))

import numpy as np
from keras.preprocessing import sequence
from sklearn.cross_validation import train_test_split
from src.models.convnets import ConvolutionalNet
from src.preprocessors.preprocess_text import clean, mark_unknown_words
from src.train import words_to_indices, SEQUENCE_LENGTH, EMBEDDING_DIMENSION, MODEL_FILE


Using TensorFlow backend.

In [2]:
# Vocabulary from the original training run, with a word -> index lookup.
vocabulary = open("../data/vocabulary.txt").read().split("\n")
inverse_vocabulary = dict((word, i) for i, word in enumerate(vocabulary))

# Combine the Reddit clickbait files, clean and deduplicate the titles, and
# mark words outside the vocabulary as <UNK>.
clickbait = open("../data/reddit/clickbait-reddit.txt").read() + "\n"
clickbait += open("../data/reddit/clickbait-top-reddit.txt").read()
clickbait = clean(clickbait)
clickbait = clickbait.split("\n")
clickbait = list(set(clickbait))
clickbait = [mark_unknown_words(vocabulary, title) for title in clickbait]

print "Clickbait"
print "-" * 50
for i, each in enumerate(clickbait[:5]):
    print "{0}. {1}".format(i+1, each)
print "-" * 50

# Same treatment for the genuine and news titles.
genuine = open("../data/reddit/genuine-reddit.txt").read() + "\n"
genuine += open("../data/reddit/news-reddit.txt").read()
genuine = clean(genuine)
genuine = genuine.split("\n")
genuine = list(set(genuine))
genuine = [mark_unknown_words(vocabulary, title) for title in genuine]

print "Genuine"
print "-" * 50
for i, each in enumerate(genuine[:5]):
    print "{0}. {1}".format(i+1, each)
print "-" * 50


Clickbait
--------------------------------------------------
1. mom <UNK> a tough <UNK> after she starts <UNK> her daughter in the car
2. if you see a $ 1 0 0 bill on your car , be <UNK> of this scam
3. <UNK> an iphone 7 with an iphone <UNK> case
4. she knows i lied
5. bernie sanders could replace president trump with little known <UNK>
--------------------------------------------------
Genuine
--------------------------------------------------
1. <UNK> says <UNK> my great grandparents were denied <UNK> in australia & amp ; were murdered in <UNK> <UNK> - the jew terrorists who were <UNK> ' about in <UNK> <UNK> as the <UNK> twin <UNK> came down on 9 1 1 have had <UNK> <UNK> <UNK> in <UNK> since the day of the outrage - <UNK> <UNK> / steven <UNK>
2. despite what you might think , <UNK> are not stupid
3. <UNK> challenge : the real story of a <UNK> military exercise and its legacy
4. death toll now 7 5 in <UNK> <UNK> , brazil as <UNK> chaos <UNK>
5. <UNK> bans the daily mail as a source for being ' <UNK> '
--------------------------------------------------
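
The helpers imported from src.preprocessors are not shown in this notebook; judging from the output above, mark_unknown_words replaces tokens missing from the vocabulary with the <UNK> marker. A minimal sketch of that behaviour (not the project's actual implementation):

def mark_unknown_words_sketch(vocabulary, title):
    # Keep known tokens, substitute <UNK> for anything outside the vocabulary.
    known = set(vocabulary)
    return " ".join(word if word in known else "<UNK>" for word in title.split())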

In [3]:
# Convert each title to a padded sequence of vocabulary indices.
C = sequence.pad_sequences([words_to_indices(inverse_vocabulary, sentence.split()) for sentence in clickbait], maxlen=SEQUENCE_LENGTH)
G = sequence.pad_sequences([words_to_indices(inverse_vocabulary, sentence.split()) for sentence in genuine], maxlen=SEQUENCE_LENGTH)

# Label clickbait as 1 and genuine as 0, then shuffle the combined dataset.
X = np.concatenate([C, G], axis=0)
y = np.array([[1] * C.shape[0] + [0] * G.shape[0]], dtype=np.int32).T
p = np.random.permutation(y.shape[0])
X = X[p]
y = y[p]

# Hold out a stratified test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
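
A quick sanity check of the shapes and class balance at this point (nothing below changes the data):

print "X: {0}, y: {1}".format(X.shape, y.shape)
print "clickbait fraction: {0:.2f}".format(y.mean())
print "train: {0} titles, test: {1} titles".format(X_train.shape[0], X_test.shape[0])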

In [4]:
params = dict(vocabulary_size=len(vocabulary), embedding_dimension=EMBEDDING_DIMENSION, input_length=SEQUENCE_LENGTH)
model = ConvolutionalNet(**params)

In [5]:
from IPython.display import SVG
from keras.utils.visualize_util import model_to_dot


SVG(model_to_dot(model).create(prog="dot", format="svg"))


Out[5]:
[Model graph: embedding_input_1 (InputLayer) -> embedding_1 (Embedding) -> convolution1d_1 -> batchnormalization_1 -> activation_1 -> convolution1d_2 -> batchnormalization_2 -> activation_2 -> convolution1d_3 -> batchnormalization_3 -> activation_3 -> maxpooling1d_1 -> flatten_1 -> dense_1 -> batchnormalization_4 -> activation_4]
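
The graph shows the topology built by ConvolutionalNet: an embedding feeding three convolution / batch-normalization / activation blocks, followed by max pooling, flattening, and a dense / batch-normalization / activation head. A rough Keras 1.x sketch of that shape (the filter counts, kernel sizes, and activations below are placeholders, not the values used in src.models.convnets; the single sigmoid output is inferred from the binary labels above):

from keras.models import Sequential
from keras.layers import Embedding, Convolution1D, BatchNormalization, Activation, MaxPooling1D, Flatten, Dense

def convolutional_net_sketch(vocabulary_size, embedding_dimension, input_length):
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dimension, input_length=input_length))
    for _ in range(3):
        # Convolution -> batch norm -> non-linearity, repeated three times.
        model.add(Convolution1D(32, 3, border_mode="same"))
        model.add(BatchNormalization())
        model.add(Activation("relu"))
    model.add(MaxPooling1D(pool_length=2))
    model.add(Flatten())
    model.add(Dense(1))
    model.add(BatchNormalization())
    model.add(Activation("sigmoid"))
    return model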

In [22]:
from keras.layers import Convolution1D, Dense
from keras.regularizers import activity_l1l2
from keras.optimizers import SGD, Adam

# Start from the weights of the previously trained detector.
model.load_weights("../models/detector.h5")

# Freeze every layer first, then selectively unfreeze below.
for layer in model.layers:
    layer.trainable = False

# Unfreeze everything above the embedding; on the convolutional and dense
# layers, drop the weight regularizer and attach an activity regularizer
# (l1 = 0.05) instead for the finetuning run.
for layer in model.layers[1:]:
    if type(layer) == Dense or type(layer) == Convolution1D:
        layer.W_regularizer = None
        layer.activity_regularizer = activity_l1l2(0.05)
    layer.trainable = True



model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
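
Before fitting, it is easy to confirm which layers will actually be updated, using only the attributes set above:

for layer in model.layers:
    print "{0:<30} trainable={1}".format(layer.name, layer.trainable)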

In [23]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, nb_epoch=5, shuffle=True)


Train on 2541 samples, validate on 848 samples
Epoch 1/5
2541/2541 [==============================] - 4s - loss: 0.5413 - acc: 0.7824 - val_loss: 0.5357 - val_acc: 0.8066
Epoch 2/5
2541/2541 [==============================] - 1s - loss: 0.3778 - acc: 0.8512 - val_loss: 0.4626 - val_acc: 0.8160
Epoch 3/5
2541/2541 [==============================] - 1s - loss: 0.3174 - acc: 0.8835 - val_loss: 0.4603 - val_acc: 0.8031
Epoch 4/5
2541/2541 [==============================] - 1s - loss: 0.2693 - acc: 0.9071 - val_loss: 0.4570 - val_acc: 0.8208
Epoch 5/5
2541/2541 [==============================] - 1s - loss: 0.2418 - acc: 0.9260 - val_loss: 0.4623 - val_acc: 0.8125
Out[23]:
<keras.callbacks.History at 0x7f6b1d29bf10>

In [24]:
model.save_weights("../models/detector.finetuned.h5")
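
With the finetuned weights saved, the same preprocessing chain can be reused to score a new headline. The helper below is illustrative (it assumes clean accepts a single title string, and the example headline is made up):

def score_headline(title):
    # Clean the title, mark out-of-vocabulary words, map to indices, and pad.
    marked = mark_unknown_words(vocabulary, clean(title))
    indices = words_to_indices(inverse_vocabulary, marked.split())
    padded = sequence.pad_sequences([indices], maxlen=SEQUENCE_LENGTH)
    return float(model.predict(padded)[0][0])

print "clickbait score: {0:.2f}".format(score_headline("you won't believe what this model predicts"))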