In [18]:
def pretty_print_review_and_label(i):
    print(types[i] + "\t:\t" + titles[i] + "...")

g = open('titles.txt', 'r', encoding="utf8")  # read the titles
titles = list(map(lambda x: x[:-1].lower(), g.readlines()))
g.close()

g = open('types.txt', 'r', encoding="utf8")  # read the types
types = list(map(lambda x: x[:-1].upper(), g.readlines()))
g.close()
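As a quick sanity check (not part of the original run), we can confirm the two files line up and exercise the helper defined above, which is otherwise never called in this notebook:

assert len(titles) == len(types)  # one label per title
pretty_print_review_and_label(0)  # prints e.g. FAKE\t:\t<first title>...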
In [19]:
from collections import Counter
import numpy as np
In [20]:
words_counts = Counter()
real_counts = Counter()
fake_counts = Counter()
In [21]:
for i in range(len(types)):
    if(types[i] == 'FAKE'):
        for word in titles[i].split(" "):
            words_counts[word] += 1
            fake_counts[word] += 1
    else:
        for word in titles[i].split(" "):
            words_counts[word] += 1
            real_counts[word] += 1
In [22]:
#fake_counts.most_common()
#real_counts.most_common()
In [23]:
fake_ratios = Counter()
for term, cnt in list(words_counts.most_common()):
    if(cnt > 20):
        fake_ratio = fake_counts[term] / float(real_counts[term] + 0.1)
        fake_ratios[term] = fake_ratio
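The `+ 0.1` in the denominator is a small smoothing term so that a word that never appears in a real title does not raise a ZeroDivisionError. A tiny worked example with made-up counts:

# hypothetical counts, for illustration only
fake_cnt, real_cnt = 30, 0
print(fake_cnt / float(real_cnt + 0.1))  # 300.0 instead of a division-by-zero error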
In [24]:
#print("ratio for 'the' = {}".format(fake_ratios["the"]))
#print("ratio for 'breaking' = {}".format(fake_ratios["breaking"]))
#print("ratio for 'the' = {}".format(fake_ratios["debate"]))
In [25]:
#for word, ratio in fake_ratios.most_common():
#    fake_ratios[word] = np.log(ratio)
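Applying the logarithm (left commented out here) would center the ratios around zero: a word equally common in fake and real titles maps to log(1) = 0, strongly "fake" words become large positive values, and strongly "real" words become large negative values. With `np` already imported above:

print(np.log(1.0))   #  0.0  -> neutral word
print(np.log(10.0))  #  2.30 -> leans fake
print(np.log(0.1))   # -2.30 -> leans real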
In [26]:
#%reset
In [27]:
import time
import sys
import numpy as np

class SentimentNetwork:
    def __init__(self, titles, labels, hidden_nodes=10, learning_rate=0.1):
        """Create a SentimentNetwork
        Args:
            titles(list) - List of titles
            labels(list) - List of FAKE/REAL labels
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducible results during development
        np.random.seed(1)

        # process the titles and their associated labels so that everything
        # is ready for training
        self.pre_process_data(titles, labels)

        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.vocab), hidden_nodes, 1, learning_rate)
    def pre_process_data(self, titles, labels):
        # populate vocab with all of the words in the given titles
        vocab = set()
        for title in titles:
            for word in title.split(" "):
                vocab.add(word)

        # Convert the vocabulary set to a list so we can access words via indices
        self.vocab = list(vocab)

        # populate label_vocab with all of the words in the given labels.
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)

        # Convert the label vocabulary set to a list so we can access labels via indices
        self.label_vocab = list(label_vocab)

        # Store the sizes of the vocabularies.
        self.vocab_size = len(self.vocab)
        self.label_vocab_size = len(self.label_vocab)

        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        for i, word in enumerate(self.vocab):
            self.word2index[word] = i

        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights
        # These are the weights between the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))

        # These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                            (self.hidden_nodes, self.output_nodes))

        # The input layer, a two-dimensional matrix with shape 1 x input_nodes
        self.layer_0 = np.zeros((1, input_nodes))
    def update_input_layer(self, title):
        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        for word in title.split(" "):
            if(word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]] += 1

    def get_target_for_label(self, label):
        if(label == 'FAKE'):
            return 1
        else:
            return 0

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        return output * (1 - output)
    def train(self, training_titles, training_labels):
        # make sure we have a matching number of titles and labels
        assert(len(training_titles) == len(training_labels))

        # Keep track of correct predictions to display accuracy during training
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()

        # loop through all the given titles and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_titles)):
            # Get the next title and its correct label
            title = training_titles[i]
            label = training_labels[i]

            ### Forward pass ###

            # Input Layer
            self.update_input_layer(title)

            # Hidden layer
            layer_1 = self.layer_0.dot(self.weights_0_1)

            # Output layer
            layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

            ### Backward pass ###

            # Output error: the difference between the actual output and the desired target.
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)  # errors propagated to the hidden layer
            layer_1_delta = layer_1_error  # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate  # update hidden-to-output weights with gradient descent step
            self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate  # update input-to-hidden weights with gradient descent step

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and label == 'FAKE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'REAL'):
                correct_so_far += 1

            # For debug purposes, print out our prediction accuracy and speed
            # throughout the training process.
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_titles)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    def test(self, testing_titles, testing_labels):
        """
        Attempts to predict the labels for the given titles,
        and uses the testing_labels to calculate the accuracy of those predictions.
        """
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given titles and call run to predict
        # its label.
        for i in range(len(testing_titles)):
            pred = self.run(testing_titles[i])
            if(pred == testing_labels[i]):
                correct += 1

            # For debug purposes, print out our prediction accuracy and speed
            # throughout the prediction process.
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_titles)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    def run(self, title):
        """
        Returns a prediction for the given title.
        """
        # Run a forward pass through the network, like in the "train" function.

        # Input Layer
        self.update_input_layer(title.lower())

        # Hidden layer
        layer_1 = self.layer_0.dot(self.weights_0_1)

        # Output layer
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

        if(layer_2[0] >= 0.5):
            return "FAKE"
        else:
            return "REAL"
In [28]:
%reset
# please reset every time before running the cells below multiple times
In [29]:
mlp = SentimentNetwork(titles[:-5000], types[:-5000], learning_rate=0.1)
mlp.train(titles[:-5000], types[:-5000])
# see if I can improve accuracy?
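The last 5,000 titles were excluded from training above, so they can serve as a held-out test set. An evaluation cell using the `test` method defined earlier (its output is not part of the original run) could look like:

mlp.test(titles[-5000:], types[-5000:])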
In [30]:
mlp.run('I love my mom')
Out[30]:
In [31]:
mlp.run('BREAKING : DOJ Says They Will “HELP” Review the 650K Emails – TruthFeed')
Out[31]: