In [ ]:
# Load the Yelp review dataset: one JSON object per line.

# Install the plotting library, then download and unpack the data.
# We only need to do this once.
!pip install -Iv bokeh==0.13.0
!wget https://storage.googleapis.com/aai17/yelp_dataset.tar
!tar xvf yelp_dataset.tar
!mv dataset/review.json yelp_reviews.json

In [1]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()


Loading BokehJS ...

In [2]:
from tqdm import tqdm
from collections import Counter
from datetime import datetime
import json

t1 = datetime.now()
print("Loading...")
with open("yelp_reviews.json", "r", encoding="utf-8") as f:
    reviews = f.read().strip().split("\n")
reviews = [json.loads(review) for review in tqdm(reviews)]
print("Loaded ", len(reviews), "reviews in ", datetime.now() - t1)

print(reviews[0]['text'], "\n\nRating: ", reviews[0]['stars'],"stars")


Loading...
100%|██████████| 5261669/5261669 [00:45<00:00, 116165.09it/s]
Loaded  5261669 reviews in  0:01:22.839652
Love the staff, love the meat, love the place. Prepare for a long line around lunch or dinner hours. 

They ask you how you want you meat, lean or something maybe, I can't remember. Just say you don't want it too fatty. 

Get a half sour pickle and a hot pepper. Hand cut french fries too. 

Rating:  5 stars
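
If memory is tight, the same file can also be parsed line by line instead of being read into one big string first. A rough alternative sketch (same yelp_reviews.json path, result not used below):

In [ ]:
# Alternative sketch: stream the JSON-lines file one line at a time so we
# never hold the raw text and the parsed objects in memory at once.
import json

reviews_stream = []
with open("yelp_reviews.json", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            reviews_stream.append(json.loads(line))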

In [3]:
# choose a random subset of reviews
count = 100000
import numpy as np
import re

np.random.seed(1)
positive = []
negative = []
all_reviews = np.array(reviews)
np.random.shuffle(all_reviews)
notalpha = re.compile('[^a-zA-Z ]')

def tokenize(text):
    # keep only letters and spaces, then lowercase
    return notalpha.sub('', text).lower().strip()
    
for review in tqdm(all_reviews):
    neg = review['stars'] < 3
    pos = review['stars'] > 3
    text = tokenize(review['text'])
    if neg and len(negative) < count:
        negative.append(text)
    elif pos and len(positive) < count:
        positive.append(text)
    if len(negative) >= count and len(positive) >= count:
        break

print("Selected ",len(positive),"positive and",len(negative),"negative reviews")


  9%|▊         | 449324/5261669 [00:06<01:09, 69264.96it/s]
Selected  100000 positive and 100000 negative reviews
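
A quick sanity check of what tokenize() does to raw text, on a made-up string:

In [ ]:
# tokenize() drops digits and punctuation, keeps spaces, and lowercases.
sample = "Love the staff; 5 stars, won't go back? YES!"
print(repr(tokenize(sample)))
# -> 'love the staff  stars wont go back yes'
# ("won't" becomes "wont", and stripped characters can leave double spaces)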


In [4]:
# Drop references to the raw review lists so the memory can be reclaimed
print("Please stand by...")
reviews = []
all_reviews = []
print("Memory cleared")


Please stand by...
Memory cleared
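
Rebinding the names to empty lists already makes the big lists collectible; an equivalent, slightly more explicit sketch would delete the names and force a collection:

In [ ]:
# Equivalent cleanup sketch: drop the names and run the garbage collector.
import gc

del reviews, all_reviews
gc.collect()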

In [5]:
from collections import Counter

positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()
pos_neg_ratios = Counter()
polarity_cutoff = 0.2
min_count = 50

print("Gathering positive words")
for review in tqdm(positive):
    for word in review.split(" "):
        positive_counts[word] += 1
        total_counts[word] += 1

print("Gathering negative words")
for review in tqdm(negative):
    for word in review.split(" "):
        negative_counts[word] += 1
        total_counts[word] += 1
        
print("Creating influence ratios of frequent words")
for term,cnt in list(total_counts.most_common()):
    if(cnt >= min_count):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        pos_neg_ratios[term] = pos_neg_ratio

for word,ratio in pos_neg_ratios.most_common():
    if(ratio > 1):
        pos_neg_ratios[word] = np.log(ratio)
    else:
        pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))

print("Creating training vocabulary")
review_vocab = set()
for word,p in pos_neg_ratios.most_common():
    if (p >= polarity_cutoff or p <= -polarity_cutoff):
        review_vocab.add(word)
        
print("Found",len(review_vocab),"words")


  1%|          | 964/100000 [00:00<00:10, 9635.25it/s]
Gathering positive words
100%|██████████| 100000/100000 [00:10<00:00, 9626.41it/s]
  1%|          | 672/100000 [00:00<00:14, 6718.00it/s]
Gathering negative words
100%|██████████| 100000/100000 [00:15<00:00, 6568.77it/s]
Creating influence ratios of frequent words
Creating training vocabulary
Found 9541 words
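
A worked example of the ratio and log transform above, on hypothetical counts (the polarity() helper is only for illustration and is not used later):

In [ ]:
# Worked example of the pos/neg ratio and its log transform.
def polarity(pos_count, neg_count):
    ratio = pos_count / float(neg_count + 1)
    return np.log(ratio) if ratio > 1 else -np.log(1 / (ratio + 0.01))

print(polarity(900, 99))   # strongly positive word -> ~ 2.20
print(polarity(99, 900))   # strongly negative word -> ~ -2.12
print(polarity(500, 500))  # neutral word           -> ~ 0.01, below the 0.2 cutoff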

In [6]:
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file

p1 = figure(title="Discriminating Word Distribution",tools="save",
            background_fill_color="#E8DDCB")

measured = []
for word in review_vocab:
    measured.append(pos_neg_ratios[word])
    
measured = np.array(measured)
hist, edges = np.histogram(measured, density=True, bins=50)

p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")

p1.legend.location = "center_right"
p1.legend.background_fill_color = "darkgrey"
p1.xaxis.axis_label = 'log(pos/neg ratio)'
p1.yaxis.axis_label = 'relative count'

p2 = figure(title="Raw Word Distribution",tools="save",
            background_fill_color="#E8DDCB")

measured = []
for word in total_counts:
    measured.append(total_counts[word])
    
measured = np.array(measured)
hist, edges = np.histogram(measured, density=True, bins=50)

p2.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")

p2.legend.location = "center_right"
p2.legend.background_fill_color = "darkgrey"
p2.xaxis.axis_label = 'word frequency'
p2.yaxis.axis_label = 'relative count'

p3 = figure(title="Min Count Word Distribution",tools="save",
            background_fill_color="#E8DDCB")

measured = []
for word in total_counts:
    c = total_counts[word]
    if c > min_count:
        measured.append(c)
    
measured = np.array(measured)
hist, edges = np.histogram(measured, density=True, bins=50)

p3.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")

p3.legend.location = "center_right"
p3.legend.background_fill_color = "darkgrey"
p3.xaxis.axis_label = 'word frequency (above cutoff)'
p3.yaxis.axis_label = 'relative count'

p4 = figure(title="Pos/Neg Distribution",tools="save",
            background_fill_color="#E8DDCB")

measured = []
for term,cnt in list(total_counts.most_common()):
    if(cnt >= min_count):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        measured.append(pos_neg_ratio)
    
measured = np.array(measured)
hist, edges = np.histogram(measured, density=True, bins=50)

p4.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")

p4.legend.location = "center_right"
p4.legend.background_fill_color = "darkgrey"
p4.xaxis.axis_label = 'pos/neg ratio'
p4.yaxis.axis_label = 'relative count'

show(gridplot([p2, p3, p4, p1], ncols=2, plot_width=400, plot_height=400, toolbar_location=None))
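
The four histograms above repeat the same boilerplate; a small helper like the sketch below (not used in this notebook) would build an equivalent figure from a title, a list of values, and an axis label:

In [ ]:
# Optional refactor sketch: one helper that builds a Bokeh histogram figure,
# equivalent to the repeated quad() blocks above.
def hist_figure(title, values, xlabel, bins=50):
    fig = figure(title=title, tools="save", background_fill_color="#E8DDCB")
    hist, edges = np.histogram(np.array(values), density=True, bins=bins)
    fig.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
             fill_color="#036564", line_color="#033649")
    fig.xaxis.axis_label = xlabel
    fig.yaxis.axis_label = 'relative count'
    return fig

# e.g. show(hist_figure("Raw Word Distribution", list(total_counts.values()), "word frequency"))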



In [7]:
# create mappings from words to numbers and vice versa

word2index = {}
index2word = {}
for i, word in enumerate(review_vocab):
    word2index[word] = i
    index2word[i] = word

n = len(review_vocab)
samples = len(positive)+len(negative)
all_words = word2index.keys()

# encode 1-hot reviews
x = np.zeros((samples, n))
y = np.zeros((samples,2))

idx = -1
print("Creating 1-hot positive encodings")
for review in tqdm(positive):
    idx += 1
    y[idx, 0] = 1          # label set once per review, not per word
    for word in review.split(" "):
        if word in all_words:
            x[idx, word2index[word]] = 1

print("Creating 1-hot negative encodings")
for review in tqdm(negative):
    idx += 1
    y[idx, 1] = 1
    for word in review.split(" "):
        if word in all_words:
            x[idx, word2index[word]] = 1


  2%|▏         | 2328/100000 [00:00<00:08, 11632.00it/s]
Creating 1-hot positive encodings
100%|██████████| 100000/100000 [00:08<00:00, 11171.87it/s]
  1%|          | 886/100000 [00:00<00:11, 8838.60it/s]
Creating 1-hot negative encodings
100%|██████████| 100000/100000 [00:12<00:00, 8294.29it/s]
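
Note that the dense matrix is big: 200,000 rows by 9,541 columns of float64 is roughly 15 GB. If that does not fit in memory, a float32 or sparse encoding is one workaround; a sketch (not used in the rest of the notebook):

In [ ]:
# Lower-memory sketch: a SciPy sparse matrix only stores the words that occur,
# and float32 halves the per-entry cost of the dense version.
from scipy.sparse import lil_matrix

x_sparse = lil_matrix((samples, n), dtype=np.float32)
# x_sparse[idx, word2index[word]] = 1 would replace the dense assignment above;
# sklearn's train_test_split accepts sparse matrices as well.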

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

In [9]:
import tensorflow as tf

# We bundle groups of examples during training for efficiency. BATCH_SIZE is
# the per-GPU batch size; each step feeds BATCH_SIZE * NUM_GPUS examples.
BATCH_SIZE = 100
VOCAB_SIZE = len(review_vocab)
EMBEDDING_SIZE = 64
NUM_LABELS = 2
NUM_GPUS = 2
LEARNING_RATE = 0.0005
DISPLAY_STEP = 100
NUM_STEPS = 2000

# The random seed that defines initialization.
SEED = 42

def model(x, prefix='model', reuse=True, is_training=True):
    # Define a scope for reusing the variables
    with tf.variable_scope('Model', reuse=reuse):
        nn = tf.layers.dense(x, EMBEDDING_SIZE, activation=tf.nn.sigmoid, name=prefix+'_embedding')
        nn = tf.layers.dense(nn, NUM_LABELS, activation=tf.nn.sigmoid, name=prefix+'_logits')
        # We only need to apply softmax to the test-time network
        out = tf.nn.softmax(nn) if not is_training else nn
        return out

print('Done')


Done
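
For scale, this model is tiny; a back-of-the-envelope parameter count for the two dense layers:

In [ ]:
# Parameter count of the two dense layers defined above (weights + biases).
embedding_params = VOCAB_SIZE * EMBEDDING_SIZE + EMBEDDING_SIZE
logits_params = EMBEDDING_SIZE * NUM_LABELS + NUM_LABELS
print(embedding_params, logits_params, embedding_params + logits_params)
# with VOCAB_SIZE = 9541 this prints 610688 130 610818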

In [10]:
# Build the function to average the gradients
def average_gradients(tower_grads):
    average_grads = []
    
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add 0 dimension to the gradients to represent the tower.
            expanded_g = tf.expand_dims(g, 0)

            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)

        # Average over the 'tower' dimension.
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)

        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So .. we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads

print('Done')


Done
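
The averaging itself is just an element-wise mean over the tower axis; a NumPy illustration on made-up gradients:

In [ ]:
# What average_gradients does per variable, illustrated with NumPy:
# stack the per-tower gradients and take the mean over the tower axis.
g_tower0 = np.array([0.2, -0.4, 1.0])   # made-up gradient from GPU 0
g_tower1 = np.array([0.4, -0.2, 0.0])   # made-up gradient from GPU 1
print(np.mean(np.stack([g_tower0, g_tower1], axis=0), axis=0))
# -> [ 0.3 -0.3  0.5]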

In [11]:
# Wire everything up.
# By default, all variables would be placed on '/gpu:0',
# so we need a custom device function to pin variables to '/cpu:0'.
# Note: if the GPUs are peered, '/gpu:0' can be a faster option.
PS_OPS = ['Variable', 'VariableV2', 'AutoReloadVariable']

def assign_to_device(device, ps_device='/cpu:0'):
    def _assign(op):
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        if node_def.op in PS_OPS:
            return "/" + ps_device
        else:
            return device

    return _assign

print('Done')


Done
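
A quick illustrative probe of the device function (built in a throwaway graph): variable ops should be routed to the parameter-server device, everything else to the requested GPU.

In [ ]:
# Illustrative probe of assign_to_device; assumes TF 1.x non-resource variables,
# whose op type is 'VariableV2'.
probe = assign_to_device('/gpu:0')
g = tf.Graph()
with g.as_default():
    v = tf.Variable(tf.zeros([1]), name='probe_var')
    m = tf.matmul(tf.zeros([2, 2]), tf.zeros([2, 2]), name='probe_matmul')
print(probe(v.op))  # variable -> parameter-server device
print(probe(m.op))  # compute op -> '/gpu:0'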

In [12]:
import time

tf.reset_default_graph()

NUM_STEPS=1000

# Place all ops on CPU by default
with tf.device('/cpu:0'):
    tower_grads = []
    reuse_vars = False
    num_samples = X_train.shape[0]

    # tf Graph input
    X = tf.placeholder(tf.float32, [None, VOCAB_SIZE])
    Y = tf.placeholder(tf.float32, [None, NUM_LABELS])

    # Loop over all GPUs and construct their own computation graph
    for i in range(NUM_GPUS):
        with tf.device(assign_to_device('/gpu:{}'.format(i), ps_device='/cpu:0')):

            # Split data between GPUs
            _x = X[i * BATCH_SIZE: (i+1) * BATCH_SIZE]
            _y = Y[i * BATCH_SIZE: (i+1) * BATCH_SIZE]

            # The network behaves differently at training and prediction time
            # (softmax is only applied at test time), so we create two
            # computation graphs that share the same weights.

            # Create a graph for training
            logits_train = model(_x, reuse=reuse_vars, prefix="yelp")
            
            # Create another graph for testing that reuses the same weights
            logits_test = model(_x, reuse=True, prefix="yelp", is_training=False)

            # Define loss and optimizer using the training logits
            loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                logits=logits_train, labels=_y))
            optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
            grads = optimizer.compute_gradients(loss_op)
            
            print("GPU",i,"configured")
            
            # Only the first GPU computes accuracy
            if i == 0:
                # Evaluate the model using the test logits
                correct_pred = tf.equal(tf.argmax(logits_test, 1), tf.argmax(_y, 1))
                accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

            reuse_vars = True
            tower_grads.append(grads)

    tower_grads = average_gradients(tower_grads)
    train_op = optimizer.apply_gradients(tower_grads)
    
    # Initializing the variables
    init = tf.global_variables_initializer()

    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)
        # Keep training until we reach the maximum number of iterations
        for step in range(1, NUM_STEPS + 1):
            # Get a batch for each GPU
            indices = np.random.choice(num_samples, BATCH_SIZE*NUM_GPUS)
            batch_x = X_train[indices]
            batch_y = y_train[indices]
            
            # Run optimization op (backprop)
            ts = time.time()
            sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
            te = time.time() - ts
            if step % DISPLAY_STEP == 0 or step == 1:
                # Calculate batch loss and accuracy
                loss, acc = sess.run([loss_op, accuracy], feed_dict={X: batch_x,
                                                                     Y: batch_y})
                print("Step " + str(step) + ": Minibatch Loss= " + \
                      "{:.4f}".format(loss) + ", Training Accuracy= " + \
                      "{:.3f}".format(acc) + ", %i Examples/sec" % int(len(batch_x)/te))
        print("Optimization Finished!")
        
        graph = tf.get_default_graph()
        t1 = graph.get_tensor_by_name('Model/yelp_embedding/kernel:0')
        embeddings = np.array(sess.run(t1))

        # Calculate accuracy over the held-out test set, batch by batch
        print("Testing Accuracy:", \
            np.mean([sess.run(accuracy, feed_dict={X: X_test[i:i+BATCH_SIZE],
            Y: y_test[i:i+BATCH_SIZE]}) for i in range(0, X_test.shape[0], BATCH_SIZE)]))


WARNING:tensorflow:From <ipython-input-12-33ad52df70a9>:37: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

GPU 0 configured
GPU 1 configured
Step 1: Minibatch Loss= 0.6962, Training Accuracy= 0.600, 143 Examples/sec
Step 100: Minibatch Loss= 0.5726, Training Accuracy= 0.810, 14454 Examples/sec
Step 200: Minibatch Loss= 0.5195, Training Accuracy= 0.890, 13153 Examples/sec
Step 300: Minibatch Loss= 0.4557, Training Accuracy= 0.940, 12475 Examples/sec
Step 400: Minibatch Loss= 0.4271, Training Accuracy= 0.950, 13539 Examples/sec
Step 500: Minibatch Loss= 0.4394, Training Accuracy= 0.930, 13104 Examples/sec
Step 600: Minibatch Loss= 0.3987, Training Accuracy= 0.970, 13046 Examples/sec
Step 700: Minibatch Loss= 0.4068, Training Accuracy= 0.940, 13251 Examples/sec
Step 800: Minibatch Loss= 0.3721, Training Accuracy= 0.930, 13309 Examples/sec
Step 900: Minibatch Loss= 0.3927, Training Accuracy= 0.980, 13131 Examples/sec
Step 1000: Minibatch Loss= 0.3614, Training Accuracy= 0.960, 13528 Examples/sec
Optimization Finished!
Testing Accuracy: 0.9358
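
The embeddings pulled out above are just the first dense layer's kernel, so each row is the 64-dimensional vector learned for one vocabulary word:

In [ ]:
# One row per vocabulary word, one column per embedding dimension.
print(embeddings.shape)  # expected (VOCAB_SIZE, EMBEDDING_SIZE), i.e. (9541, 64)
# first few components of one word's vector (assuming 'delicious' made the vocabulary)
print(embeddings[word2index['delicious']][:5])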

In [13]:
def get_most_similar_words(focus = "love"):
    keys = word2index.keys()
    if focus not in keys:
        print("Sorry, word not found")
        return
    
    most_similar = Counter()
    for word in word2index.keys():
        most_similar[word] = np.dot(embeddings[word2index[word]],
                                    embeddings[word2index[focus]])
    
    return most_similar.most_common()[0:10]

get_most_similar_words('yummy')


Out[13]:
[('amazing', 4.0019403),
 ('delicious', 3.989573),
 ('awesome', 3.978746),
 ('excellent', 3.960814),
 ('great', 3.8401873),
 ('perfect', 3.6464357),
 ('fantastic', 3.565358),
 ('love', 3.2784226),
 ('best', 3.2371793),
 ('thank', 3.1843255)]
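
get_most_similar_words ranks neighbours by raw dot product, which also rewards vectors with large norms. A cosine-similarity variant (hypothetical helper, not part of the original notebook) is a common alternative; the query word itself will rank first with similarity 1.0:

In [ ]:
# Hypothetical variant: rank by cosine similarity so vector length
# does not dominate the ranking.
def get_most_similar_words_cosine(focus="love", topn=10):
    if focus not in word2index:
        print("Sorry, word not found")
        return
    norms = np.linalg.norm(embeddings, axis=1) + 1e-8
    unit = embeddings / norms[:, None]
    sims = unit @ unit[word2index[focus]]
    best = np.argsort(-sims)[:topn]
    return [(index2word[i], float(sims[i])) for i in best]

get_most_similar_words_cosine('yummy')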

In [14]:

words_to_visualize = list()
for word, ratio in pos_neg_ratios.most_common(500):
    if(word in word2index.keys()):
        words_to_visualize.append(word)
    
for word, ratio in list(reversed(pos_neg_ratios.most_common()))[0:500]:
    if(word in word2index.keys()):
        words_to_visualize.append(word)

In [15]:
pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in pos_neg_ratios.keys():
        vectors_list.append(embeddings[word2index[word]])
        if(pos_neg_ratios[word] > 0):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")
            
print("Pos",pos,"neg",neg)


Pos 500 neg 500

In [16]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)
print("TSNE visualization ready")


TSNE visualization ready

In [17]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    color=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)



In [ ]: