In [ ]:
# Let's load up the Yelp review dataset,
# an array of JSON structures
# Grab the data and progress bar
# We only need to do this once.
!pip install -Iv bokeh==0.13.0
import codecs
from io import open
!wget https://storage.googleapis.com/aai17/yelp_dataset.tar
!tar xvf yelp_dataset.tar
!mv dataset/review.json yelp_reviews.json
In [1]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()
In [2]:
from tqdm import tqdm
from collections import Counter
from datetime import datetime
import json
t1 = datetime.now()
print("Loading...")
with open("yelp_reviews.json", "r", encoding="utf-8") as f:
    reviews = f.read().strip().split("\n")
reviews = [json.loads(review) for review in tqdm(reviews)]
print("Loaded ", len(reviews), "reviews in ", datetime.now() - t1)
print(reviews[0]['text'], "\n\nRating: ", reviews[0]['stars'],"stars")
In [3]:
# choose a random subset of reviews
count = 100000
import numpy as np
import re
np.random.seed(1)
positive = []
negative = []
all_reviews = np.array(reviews)
np.random.shuffle(all_reviews)
notalpha = re.compile('[^a-zA-Z ]')
def tokenize(text):
    return notalpha.sub('', text).lower().strip()

for review in tqdm(all_reviews):
    neg = review['stars'] < 3
    pos = review['stars'] > 3
    text = tokenize(review['text'])
    if neg and len(negative) < count:
        negative.append(text)
    elif pos and len(positive) < count:
        positive.append(text)
    if len(negative) >= count and len(positive) >= count:
        break
print("Selected ",len(positive),"positive and",len(negative),"negative reviews")
In [4]:
# clean up memory
print("Please stand by...")
reviews = []
all_reviews = []
print("Memory cleared")
In [5]:
from collections import Counter
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()
pos_neg_ratios = Counter()
polarity_cutoff = 0.2
min_count = 50
print("Gathering positive words")
for review in tqdm(positive):
    for word in review.split(" "):
        positive_counts[word] += 1
        total_counts[word] += 1
print("Gathering negative words")
for review in tqdm(negative):
    for word in review.split(" "):
        negative_counts[word] += 1
        total_counts[word] += 1
print("Creating influence ratios of frequent words")
for term, cnt in list(total_counts.most_common()):
    if cnt >= min_count:
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term] + 1)
        pos_neg_ratios[term] = pos_neg_ratio
for word, ratio in pos_neg_ratios.most_common():
    if ratio > 1:
        pos_neg_ratios[word] = np.log(ratio)
    else:
        pos_neg_ratios[word] = -np.log(1 / (ratio + 0.01))
print("Creating training vocabulary")
review_vocab = set()
for word, p in pos_neg_ratios.most_common():
    if p >= polarity_cutoff or p <= -polarity_cutoff:
        review_vocab.add(word)
print("Found", len(review_vocab), "words")
In [6]:
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file
p1 = figure(title="Discriminating Word Distribution", tools="save",
            background_fill_color="#E8DDCB")
measured = []
for word in review_vocab:
    measured.append(pos_neg_ratios[word])
measured = np.array(measured)
hist, edges = np.histogram(measured, density=True, bins=50)
p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")
p1.legend.location = "center_right"
p1.legend.background_fill_color = "darkgrey"
p1.xaxis.axis_label = 'log(pos/neg ratio)'
p1.yaxis.axis_label = 'relative count'

p2 = figure(title="Raw Word Distribution", tools="save",
            background_fill_color="#E8DDCB")
measured = []
for word in total_counts:
    measured.append(total_counts[word])
measured = np.array(measured)
hist, edges = np.histogram(measured, density=True, bins=50)
p2.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")
p2.legend.location = "center_right"
p2.legend.background_fill_color = "darkgrey"
p2.xaxis.axis_label = 'word frequency'
p2.yaxis.axis_label = 'relative count'

p3 = figure(title="Min Count Word Distribution", tools="save",
            background_fill_color="#E8DDCB")
measured = []
for word in total_counts:
    c = total_counts[word]
    if c > min_count:
        measured.append(c)
measured = np.array(measured)
hist, edges = np.histogram(measured, density=True, bins=50)
p3.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")
p3.legend.location = "center_right"
p3.legend.background_fill_color = "darkgrey"
p3.xaxis.axis_label = 'word frequency (above cutoff)'
p3.yaxis.axis_label = 'relative count'

p4 = figure(title="Pos/Neg Distribution", tools="save",
            background_fill_color="#E8DDCB")
measured = []
for term, cnt in list(total_counts.most_common()):
    if cnt >= min_count:
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term] + 1)
        measured.append(pos_neg_ratio)
measured = np.array(measured)
hist, edges = np.histogram(measured, density=True, bins=50)
p4.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")
p4.legend.location = "center_right"
p4.legend.background_fill_color = "darkgrey"
p4.xaxis.axis_label = 'pos/neg ratio'
p4.yaxis.axis_label = 'relative count'
show(gridplot([p2, p3, p4, p1], ncols=2, plot_width=400, plot_height=400, toolbar_location=None))
In [7]:
# create mappings from words to numbers and vice versa
word2index = {}
index2word = {}
for i, word in enumerate(review_vocab):
    word2index[word] = i
    index2word[i] = word
n = len(review_vocab)
samples = len(positive)+len(negative)
all_words = word2index.keys()
# encode 1-hot reviews
x = np.zeros((samples, n))
y = np.zeros((samples,2))
idx = -1
print("Creating 1-hot positive encodings")
for review in tqdm(positive):
    idx += 1
    for word in review.split(" "):
        if word in all_words:
            x[idx, word2index[word]] = 1
    y[idx, 0] = 1
print("Creating 1-hot negative encodings")
for review in tqdm(negative):
    idx += 1
    for word in review.split(" "):
        if word in all_words:
            x[idx, word2index[word]] = 1
    y[idx, 1] = 1
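In [ ]:
# Optional spot check of the encoding above (nothing here is needed later):
# row 0 came from the positive list, so its label is [1, 0] and the number of
# ones in x[0] equals the number of distinct vocabulary words in that review.
vocab_words_in_first = set(w for w in positive[0].split(" ") if w in word2index)
print(y[0], int(x[0].sum()), len(vocab_words_in_first))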
In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)
In [9]:
import tensorflow as tf
# We'll bundle groups of examples during training for efficiency.
# This defines the size of the batch.
BATCH_SIZE = 100
VOCAB_SIZE = len(review_vocab)
EMBEDDING_SIZE = 64
NUM_LABELS = 2
NUM_GPUS = 2
LEARNING_RATE = 0.0005
DISPLAY_STEP = 100
NUM_STEPS = 2000
# The random seed that defines initialization.
SEED = 42
def model(x, prefix='model', reuse=True, is_training=True):
    # Define a scope for reusing the variables
    with tf.variable_scope('Model', reuse=reuse):
        nn = tf.layers.dense(x, EMBEDDING_SIZE, activation=tf.nn.sigmoid, name=prefix + '_embedding')
        nn = tf.layers.dense(nn, NUM_LABELS, activation=tf.nn.sigmoid, name=prefix + '_logits')
        # We only need to apply softmax to the testing network; the training
        # network returns its raw outputs for softmax_cross_entropy_with_logits.
        out = tf.nn.softmax(nn) if not is_training else nn
    return out
print('Done')
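In [ ]:
# Optional shape check for model() above, using a throwaway placeholder (the
# training cell further down calls tf.reset_default_graph(), so nothing built
# here is kept).
probe = tf.placeholder(tf.float32, [None, VOCAB_SIZE])
print(model(probe, reuse=False, prefix="probe"))  # a (?, NUM_LABELS) tensor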
In [10]:
# Build the function to average the gradients
def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add a 0 dimension to the gradients to represent the tower.
            expanded_g = tf.expand_dims(g, 0)
            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)
        # Average over the 'tower' dimension.
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)
        # Keep in mind that the Variables are redundant because they are shared
        # across towers, so we just return the first tower's pointer to the Variable.
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads
print('Done')
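In [ ]:
# A throwaway sanity check of average_gradients() with two fake towers that
# share one toy variable (a later cell resets the graph, so nothing here is
# reused).
toy_var = tf.Variable(tf.zeros([2]), name='toy_var')
tower_a = [(tf.constant([1.0, 3.0]), toy_var)]
tower_b = [(tf.constant([3.0, 5.0]), toy_var)]
avg = average_gradients([tower_a, tower_b])
with tf.Session() as s:
    print(s.run(avg[0][0]))  # element-wise mean of the two gradients: [2. 4.]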
In [11]:
# wire everything up
# By default, all variables will be placed on '/gpu:0'
# So we need a custom device function, to assign all variables to '/cpu:0'
# Note: If GPUs are peered, '/gpu:0' can be a faster option
PS_OPS = ['Variable', 'VariableV2', 'AutoReloadVariable']
def assign_to_device(device, ps_device='/cpu:0'):
    def _assign(op):
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        if node_def.op in PS_OPS:
            return "/" + ps_device
        else:
            return device
    return _assign
print('Done')
In [12]:
import time
tf.reset_default_graph()
NUM_STEPS=1000
# Place all ops on CPU by default
with tf.device('/cpu:0'):
    tower_grads = []
    reuse_vars = False
    num_samples = X_train.shape[0]

    # tf Graph input
    X = tf.placeholder(tf.float32, [None, VOCAB_SIZE])
    Y = tf.placeholder(tf.float32, [None, NUM_LABELS])

    # Loop over all GPUs and construct their own computation graph
    for i in range(NUM_GPUS):
        with tf.device(assign_to_device('/gpu:{}'.format(i), ps_device='/cpu:0')):
            # Split data between GPUs
            _x = X[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]
            _y = Y[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]

            # Because the model applies softmax only at prediction time, we need
            # two distinct computation graphs that share the same weights.
            # Create a graph for training
            logits_train = model(_x, reuse=reuse_vars, prefix="yelp")
            # Create another graph for testing that reuses the same weights
            logits_test = model(_x, reuse=True, prefix="yelp", is_training=False)

            # Define loss and optimizer (on the raw training logits)
            loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                logits=logits_train, labels=_y))
            optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
            grads = optimizer.compute_gradients(loss_op)
            print("GPU", i, "configured")

            # Only the first GPU computes accuracy
            if i == 0:
                # Evaluate the model (on the softmax-ed test outputs)
                correct_pred = tf.equal(tf.argmax(logits_test, 1), tf.argmax(_y, 1))
                accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

            reuse_vars = True
            tower_grads.append(grads)

    tower_grads = average_gradients(tower_grads)
    train_op = optimizer.apply_gradients(tower_grads)

    # Initializing the variables
    init = tf.global_variables_initializer()

    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)
        # Keep training until we reach the maximum number of steps
        for step in range(1, NUM_STEPS + 1):
            # Get a batch for each GPU
            indices = np.random.choice(num_samples, BATCH_SIZE * NUM_GPUS)
            batch_x = X_train[indices]
            batch_y = y_train[indices]
            # Run optimization op (backprop)
            ts = time.time()
            sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
            te = time.time() - ts
            if step % DISPLAY_STEP == 0 or step == 1:
                # Calculate batch loss and accuracy
                loss, acc = sess.run([loss_op, accuracy], feed_dict={X: batch_x,
                                                                     Y: batch_y})
                print("Step " + str(step) + ": Minibatch Loss= " +
                      "{:.4f}".format(loss) + ", Training Accuracy= " +
                      "{:.3f}".format(acc) + ", %i Examples/sec" % int(len(batch_x) / te))
        print("Optimization Finished!")

        # Pull the trained embedding matrix out of the graph for later analysis
        graph = tf.get_default_graph()
        t1 = graph.get_tensor_by_name('Model/yelp_embedding/kernel:0')
        embeddings = np.array(sess.run(t1))

        # Calculate accuracy over the held-out test reviews, one batch at a time
        print("Testing Accuracy:",
              np.mean([sess.run(accuracy, feed_dict={X: X_test[i:i + BATCH_SIZE],
                                                     Y: y_test[i:i + BATCH_SIZE]})
                       for i in range(0, X_test.shape[0], BATCH_SIZE)]))
In [13]:
def get_most_similar_words(focus="love"):
    if focus not in word2index:
        print("Sorry, word not found")
        return
    most_similar = Counter()
    for word in word2index.keys():
        most_similar[word] = np.dot(embeddings[word2index[word]],
                                    embeddings[word2index[focus]])
    return most_similar.most_common()[0:10]

get_most_similar_words('yummy')
Out[13]:
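In [ ]:
# The dot product above favours words whose embedding vectors happen to have a
# large norm. A rough alternative sketch (reusing word2index, index2word and
# embeddings from the earlier cells): the same lookup with cosine similarity.
def get_most_similar_words_cosine(focus="love", topn=10):
    if focus not in word2index:
        print("Sorry, word not found")
        return
    unit = embeddings / (np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-8)
    sims = unit.dot(unit[word2index[focus]])
    best = np.argsort(-sims)[:topn]
    return [(index2word[i], float(sims[i])) for i in best]

get_most_similar_words_cosine('yummy')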
In [14]:
import matplotlib.colors as colors
words_to_visualize = list()
for word, ratio in pos_neg_ratios.most_common(500):
    if word in word2index:
        words_to_visualize.append(word)
for word, ratio in list(reversed(pos_neg_ratios.most_common()))[0:500]:
    if word in word2index:
        words_to_visualize.append(word)
In [15]:
pos = 0
neg = 0
colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in pos_neg_ratios:
        vectors_list.append(embeddings[word2index[word]])
        if pos_neg_ratios[word] > 0:
            pos += 1
            colors_list.append("#00ff00")
        else:
            neg += 1
            colors_list.append("#000000")
print("Pos", pos, "neg", neg)
In [16]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)
print("TSNE visualization ready")
In [17]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")
source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:, 0],
                                    x2=words_top_ted_tsne[:, 1],
                                    names=words_to_visualize,
                                    color=colors_list))
p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")
word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                       text_font_size="8pt", text_color="#555555",
                       source=source, text_align='center')
p.add_layout(word_labels)
show(p)
In [ ]: