The aim of this experiment is to compare the deep learning model against a conventional machine learning approach (a TF-IDF + SVM baseline) on the same clickbait detection task.
In [19]:
import sys
import string
import re
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from keras.models import load_model
from keras.preprocessing import sequence
In [20]:
genuine = open("../data/genuine.preprocessed.txt").read().split("\n")
clickbait = open("../data/clickbait.preprocessed.txt").read().split("\n")
print("Clickbait: ")
for each in clickbait[:5]:
    print(each)
print("-" * 50)
print("Genuine: ")
for each in genuine[:5]:
    print(each)
data = clickbait + genuine
labels = len(clickbait) * [1] + len(genuine) * [0]
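Since `data` concatenates the clickbait headlines first, a quick sanity check (not in the original notebook) confirms that the labels line up with that ordering:
In [ ]:
# Sanity check: the first entries of data are clickbait (label 1),
# the last are genuine (label 0).
assert labels[0] == 1 and labels[-1] == 0
print(len(data), "headlines,", sum(labels), "labeled clickbait")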
In [21]:
clickbait_valid = open("../data/clickbait.valid.txt").read().split("\n")
genuine_valid = open("../data/genuine.valid.txt").read().split("\n")
print("Clickbait: ")
for each in clickbait_valid[:5]:
    print(each)
print("-" * 50)
print("Genuine: ")
for each in genuine_valid[:5]:
    print(each)
valid_data = clickbait_valid + genuine_valid
vocabulary = open("../data/vocabulary.txt").read().split("\n")
inverse_vocabulary = dict((word, i) for i, word in enumerate(vocabulary))
# Replace out-of-vocabulary words with <UNK>; the dict gives O(1) lookups.
valid_data = [" ".join([w if w in inverse_vocabulary else "<UNK>" for w in sent.split()]) for sent in valid_data]
valid_labels = len(clickbait_valid) * [1] + len(genuine_valid) * [0]
In [22]:
svm_clf = Pipeline([("vect", CountVectorizer()),
                    ("tfidf", TfidfTransformer()),
                    ("clf", SVC())])
In [23]:
svm_clf.fit(data, labels);
In [24]:
UNK = "<UNK>"
PAD = "<PAD>"
MATCH_MULTIPLE_SPACES = re.compile(r" {2,}")
SEQUENCE_LENGTH = 20

def words_to_indices(words):
    return [inverse_vocabulary.get(word, inverse_vocabulary[UNK]) for word in words]

def clean(text):
    # Pad punctuation and digits with spaces so they become separate tokens.
    for punctuation in string.punctuation:
        text = text.replace(punctuation, " " + punctuation + " ")
    for i in range(10):
        text = text.replace(str(i), " " + str(i) + " ")
    text = MATCH_MULTIPLE_SPACES.sub(" ", text)
    return text

model = load_model("../models/detector.h5")
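A quick illustration (with a made-up headline) of the preprocessing chain applied below: `clean` splits punctuation and digits into their own tokens, and `words_to_indices` maps each token to its vocabulary index, falling back to the <UNK> index for unknown words:
In [ ]:
# Hypothetical headline; out-of-vocabulary tokens map to the <UNK> index.
example = "10 Things You Won't Believe!"
print(clean(example.lower()))
print(words_to_indices(clean(example.lower()).split()))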
In [25]:
inputs = sequence.pad_sequences([words_to_indices(clean(sent.lower()).split()) for sent in valid_data], maxlen=SEQUENCE_LENGTH)
predictions = model.predict(inputs)
predictions = predictions.flatten() > 0.5  # threshold the sigmoid output
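The same steps can score a single new headline (a hypothetical example); the model outputs a sigmoid probability, and anything above 0.5 is treated as clickbait:
In [ ]:
# Score one made-up headline end to end.
headline = "7 tricks advertisers don't want you to know"
x = sequence.pad_sequences([words_to_indices(clean(headline.lower()).split())],
                           maxlen=SEQUENCE_LENGTH)
print(model.predict(x).flatten()[0] > 0.5)  # True -> predicted clickbait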
In [26]:
print ("SVM")
print (metrics.classification_report(valid_labels, svm_clf.predict(valid_data)))
print "-" * 50
print ("Convolutional Neural Network")
print (metrics.classification_report(valid_labels, predictions))
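For a single-number comparison alongside the per-class reports, accuracy can be computed directly (a small addition, reusing the predictions from above):
In [ ]:
print("SVM accuracy: %.3f" % metrics.accuracy_score(valid_labels, svm_clf.predict(valid_data)))
print("CNN accuracy: %.3f" % metrics.accuracy_score(valid_labels, predictions))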