In [1]:
import argparse
import os
import shutil
import gzip

import torch
import torch.nn as nn
from torch.autograd import Variable

from dpp_nets.utils.language import Vocabulary, BeerDataset, EvalSet, custom_collate
from dpp_nets.layers.layers import ChunkTrainer, ChunkTrainerRel

In [2]:
embd_path = '/Users/Max/data/beer_reviews/review+wiki.filtered.200.txt.gz'
word_path = '/Users/Max/data/beer_reviews/reviews.all.train.words.txt.gz'

# Set-up Vocabulary
vocab = Vocabulary()
vocab.loadPretrained(embd_path)
vocab.setStops()
vocab.loadCorpus(word_path)
vocab.updateEmbedding()
vocab.setCuda(False)
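
In [ ]:
# Optional sanity check (a sketch, assuming vocab.EmbeddingBag is a
# standard nn.EmbeddingBag): the pretrained vectors should be
# 200-dimensional, matching EMBD_DIM below.
print(vocab.EmbeddingBag.weight.size())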

In [3]:
# Load saved checkpoint
ckpt_name = 'allwords1reg0.1reg_mean50.0lr0.001marginal_best_ckp.pth.tar'
model_dir = '/Users/Max/checkpoints/beer_reviews/marginal/'
model_path = os.path.join(model_dir, ckpt_name)
model = torch.load(model_path, map_location=lambda storage, loc: storage)
vocab.EmbeddingBag.load_state_dict(model['embedding'])

In [4]:
EMBD_DIM = 200
KERNEL_DIM = 200
HIDDEN_DIM = 500
ENC_DIM = 200
TARGET_DIM = 3 if model['aspect'] in {'all', 'short'} else 1

if model['mode'] == 'sents':
    trainer = ChunkTrainer(EMBD_DIM, HIDDEN_DIM, KERNEL_DIM, ENC_DIM, TARGET_DIM)
else:
    trainer = ChunkTrainerRel(EMBD_DIM, HIDDEN_DIM, KERNEL_DIM, ENC_DIM, TARGET_DIM)

# Restore weights and evaluation-time settings from the checkpoint
trainer.load_state_dict(model['model'])
trainer.activation = nn.Sigmoid()
trainer.reg = model['reg']
trainer.reg_mean = model['reg_mean']

rat_path = '/Users/Max/data/beer_reviews/annotations.json'
evalset = EvalSet(rat_path, vocab)
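
In [ ]:
# Quick inspection of the restored model: any nn.Module prints its
# submodule structure.
print(trainer)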

In [5]:
# Print a table of training losses per epoch
print('__________________________Training Table__________________________')
for k, v in model['train_loss'].items():
    epoch, loss, pred_loss, reg_loss = k, v[0], model['train_pred_loss'][k][0], model['train_reg_loss'][k][0]
    print(" | ".join(['Epoch: %d' % epoch, 'Loss: %.5f' % loss,
                      'Pred Loss: %.5f' % pred_loss, 'Reg Loss: %.5f' % reg_loss]))


__________________________Training Table__________________________
Epoch: 0 | Loss: 0.02673 | Pred Loss: 0.02467 | Reg Loss: 0.00206
Epoch: 1 | Loss: 0.01947 | Pred Loss: 0.01924 | Reg Loss: 0.00023
Epoch: 2 | Loss: 0.01694 | Pred Loss: 0.01674 | Reg Loss: 0.00019
Epoch: 3 | Loss: 0.01467 | Pred Loss: 0.01448 | Reg Loss: 0.00019
Epoch: 4 | Loss: 0.01268 | Pred Loss: 0.01249 | Reg Loss: 0.00020
Epoch: 5 | Loss: 0.01110 | Pred Loss: 0.01090 | Reg Loss: 0.00020
Epoch: 6 | Loss: 0.00981 | Pred Loss: 0.00960 | Reg Loss: 0.00021

In [6]:
from dpp_nets.helper.plotting import plot_floats

# Training Plots
plot_floats(model['train_loss'], xlabel='Epochs', ylabel='MSE + Reg', title='Training MSE + Reg')
plot_floats(model['train_pred_loss'], xlabel='Epochs', ylabel='MSE', title='Training MSE')
plot_floats(model['train_reg_loss'], xlabel='Epochs', ylabel='Reg', title='Training Reg')



In [7]:
print('_________________________Validation Table_________________________')
for k, v in model['val_loss'].items():
    epoch, loss, pred_loss, reg_loss = k, v[0], model['val_pred_loss'][k][0], model['val_reg_loss'][k][0]
    print(" | ".join(['Epoch: %d' % epoch, 'Loss: %.5f' % loss,
                      'Pred Loss: %.5f' % pred_loss, 'Reg Loss: %.5f' % reg_loss]))


_________________________Validation Table_________________________
Epoch: 0 | Loss: 0.02213 | Pred Loss: 0.02185 | Reg Loss: 0.00028
Epoch: 1 | Loss: 0.01928 | Pred Loss: 0.01909 | Reg Loss: 0.00019
Epoch: 2 | Loss: 0.01864 | Pred Loss: 0.01846 | Reg Loss: 0.00018
Epoch: 3 | Loss: 0.01821 | Pred Loss: 0.01802 | Reg Loss: 0.00019
Epoch: 4 | Loss: 0.01823 | Pred Loss: 0.01803 | Reg Loss: 0.00020
Epoch: 5 | Loss: 0.01845 | Pred Loss: 0.01824 | Reg Loss: 0.00021
Epoch: 6 | Loss: 0.01813 | Pred Loss: 0.01792 | Reg Loss: 0.00021

In [8]:
# Validation Plots
plot_floats(model['val_loss'], xlabel='Epochs', ylabel='MSE + Reg', title='Validation MSE + Reg')
plot_floats(model['val_pred_loss'], xlabel='Epochs', ylabel='MSE', title='Validation MSE')
plot_floats(model['val_reg_loss'], xlabel='Epochs', ylabel='Reg', title='Validation Reg')



In [9]:
# Evaluation on Test Set

loss, pred_loss, reg_loss = evalset.computeLoss(trainer, model['mode'])
print(" | ".join(['Test Set:', 'Loss: %.5f' % loss,
                  'Pred Loss: %.5f' % pred_loss, 'Reg Loss: %.5f' % reg_loss]))

prec, extract = evalset.evaluatePrecision(trainer, model['mode'])
print(" | ".join(['Test Set:', 'Precision: %.5f' % prec, 'Extract: %.5f' % extract]))


Test Set: | Loss: 0.01445 | Pred Loss: 0.01421 | Reg Loss: 0.00023
Test Set: | Precision: 0.51603 | Extract: 0.19967

In [10]:
# Random Samples
evalset.sample(trainer, model['mode'])


index is: 713
('tropical',) set() [('tropical',)]
('great',) set() [('great',)]
('blast',) {'1'} [('blast',)]
('retention',) {'0'} [('retention',)]
('chalky',) {'2'} [('chalky',)]
('cocoa',) set() [('cocoa',)]
('quite',) set() [('quite',)]
('minimal',) {'0'} [('minimal',)]
('dose',) {'1'} [('dose',)]
('blend',) set() [('blend',)]
('citric',) {'1'} [('citric',)]
('weak',) {'0'} [('weak',)]
("'s",) {'0'} [("'s",)]
('not',) set() [('not',)]
('bread',) {'1'} [('bread',)]
('bodied',) {'2'} [('bodied',)]
('pale',) {'1'} [('pale',)]
('opaque',) {'0'} [('opaque',)]
Precision is: 0.6666666666666666
Extraction Percentage is: 0.1651376146788991
[(Appearance: Golden Honey., {'0'}), (Slightly Orange., {'0'}), (It's a little opaque., {'0'}), (It has a weak head with minimal retention.		, {'0'}), (Smell: A huge dose of bread pale malts., {'1'}), (There is a huge blast of very citric hops. 		, {'1'}), (Taste: Wow., set()), (This taste great., set()), (It has a good blend of cocoa malts., set()), (Very bitter piney hops, set()), (. Slight tropical fruit sweetness., set()), (Apricot., set()), (Pineapple., set()), (Very bitter finish.		, set()), (Mouthfeel: Medium bodied with a chalky finish.		, {'2'}), (Drinkability: Good but not quite as enjoyable as the Pliny the Elder which she had before., set())]

 0.7302  0.8185  0.6234
[torch.FloatTensor of size 1x3]
 
 0.8000
 0.8000
 0.8000
[torch.FloatTensor of size 3]

Loss: 0.01233369205147028 Pred Loss 0.012130477465689182 Reg Loss 0.00020321480405982584

In [11]:
# Random Marginals
evalset.computeMarginals(trainer, model['mode'],100)


index is: 100
0 0.770350129435 ('ripe',)
1 0.76782370539 ('come',)
2 0.71575437221 ('back',)
3 0.71219528814 ('relatively',)
4 0.71084157174 ('though',)
5 0.680593881571 ('still',)
6 0.678586982593 ('500',)
7 0.662485916099 ('ml',)
8 0.662478459336 ('use',)
9 0.636493719437 ('cloudy',)
10 0.628845142506 ('bubblegum',)
11 0.622096185417 ('refreshing',)
12 0.603468649948 ('looking',)
13 0.600377118751 ('weissbiers',)
14 0.599122046458 ('outstanding',)
15 0.58354977317 ('large',)
16 0.565666823697 ('much',)
17 0.529512168043 ('great',)
18 0.507770733966 ('bite',)
19 0.496301897894 ('bananas',)
20 0.494000640966 ('flavours',)
21 0.48781272185 ('hefe',)
22 0.487510963234 ('bodied',)
23 0.470835073835 ('big',)
24 0.459549810829 ('aftertaste',)
25 0.45512238827 ('available',)
26 0.453109484977 ('golden',)
27 0.440644847681 ('cheap',)
28 0.414031138418 ('cloves',)
29 0.392302523984 ('could',)
30 0.38328965741 ('time',)
31 0.376689349602 ('flavour',)
32 0.371790164574 ('tasty',)
33 0.363069146181 ('strong',)
34 0.353564831133 ('not',)
35 0.338207274669 ('light',)
36 0.319369823754 ('bottle',)
37 0.254406423602 ('head',)
38 0.199447981049 ('aroma',)

In [12]:
# Returns (prediction loss, extraction percentage) under MAP decoding
evalset.computeMAPPredLoss(trainer, model['mode'])


Out[12]:
(0.031056680142549434, 0.19966675073322993)

In [13]:
# computeMUEPredLoss also requires an n_runs argument; the original
# call omitted it and raised a TypeError. 10 runs is an arbitrary
# placeholder choice.
evalset.computeMUEPredLoss(trainer, model['mode'], n_runs=10)

In [ ]:
# Precompute MAP extractions (stored in evalset.MAPS, used below)
evalset.create_MAPS(trainer, model['mode'])

In [ ]:
# Collect words from the MAP extractions, split by the score stored
# alongside each MAP (high vs. low threshold)
good = []
bad = []

for m, t in evalset.MAPS:
    if t > 0.9:
        for tup in m:
            for word in tup:
                good.append(word)
    elif t < 0.45:
        for tup in m:
            for word in tup:
                bad.append(word)

In [ ]:
# Flatten every MAP extraction into one word list
l = []
for m, t in evalset.MAPS:
    for tup in m:
        for word in tup:
            l.append(word)
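
In [ ]:
# Equivalent flattening via itertools (a sketch; should match the
# explicit loops above)
from itertools import chain
l = list(chain.from_iterable(chain.from_iterable(m for m, _ in evalset.MAPS)))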

In [ ]:
from collections import Counter

all_revs = Counter(l)
good_revs = Counter(good)
bad_revs = Counter(bad)

# Counter.most_common() already returns (word, count) pairs sorted by
# descending count, replacing the manual sort + reverse
sorted_good = good_revs.most_common()
sorted_bad = bad_revs.most_common()
sorted_all = all_revs.most_common()

In [ ]:
# Most frequent words among well-predicted reviews
n_words = 20
for word, count in sorted_good[:n_words]:
    print(word, count)
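
In [ ]:
# Same view for the poorly-predicted reviews (sorted_bad built above)
for word, count in sorted_bad[:n_words]:
    print(word, count)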

In [ ]:
from dpp_nets.utils.language import simple_collate, custom_collate_reinforce

val_path = '/Users/Max/data/beer_reviews/' + 'reviews.' + model['aspect'] + '.heldout.' + model['mode'] + '.txt.gz'
val_set = BeerDataset(val_path)
val_loader = torch.utils.data.DataLoader(val_set, collate_fn=simple_collate, batch_size=1)

def compute_word_clouds(trainer, batch, alpha_iter, cuda, good_thres, bad_thres):
    # The original draft referenced undefined globals (batch, args,
    # review); they are passed in or derived explicitly here.
    # computeMAP is assumed to be importable from dpp_nets (adjust
    # the import to wherever it actually lives).
    review = batch[0]['review']
    words, target = custom_collate_reinforce(batch, vocab, alpha_iter, cuda)
    target_mean = target.data.mean()

    # Compute the MAP set of the DPP with kernel L = K K^T
    kernel, _ = trainer.kernel_net(words)
    L = (kernel.data.mm(kernel.data.t())).numpy()
    return_ixs = computeMAP(L)

    MAP = []
    for i in return_ixs:
        txt = list(review.clean.keys())[i]
        rat = review.rev[txt]
        MAP.extend(rat)

    # Splitting the extracted words into good/bad clouds via
    # good_thres/bad_thres was left unfinished in the draft; return
    # the raw pieces instead.
    return MAP, target_mean
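
In [ ]:
# Usage sketch for compute_word_clouds (alpha_iter=1, cuda=False and
# the thresholds are placeholder choices)
for batch in val_loader:
    MAP, target_mean = compute_word_clouds(trainer, batch, 1, False, 0.9, 0.45)
    print(MAP, target_mean)
    break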

In [ ]:
# Inspect the batches; `batch` keeps the last one for the cells below
for batch in val_loader:
    print(batch)

In [ ]:
review, target = batch[0]['review'], batch[0]['target']

In [ ]:
custom_collate(batch, evalset.vocab)

In [ ]:
# Map the review to its word embeddings (`review` was unpacked from
# the batch two cells above; the draft used an undefined `d`)
rep = vocab.returnEmbds(review)
rep

In [ ]:
evalset.words[0][2]

In [ ]:
# Mean per-aspect variance of the targets: roughly the MSE a constant
# (mean-value) predictor would achieve
torch.mean(torch.var(torch.stack(evalset.targets), 0))

In [49]:
import numpy as np

# DPP-net sparsity/MSE measurements plus two flat baselines
sparsity = np.array([0.04, 0.08, 0.11, 0.19, 0.24, 0.3, 0.5, 1])
performance = np.array([0.014, 0.0136, 0.0127, 0.0111, 0.01123, 0.0105, 0.0095, 0.0092])
SVMx = [0, 1]
SVMy = [0.015, 0.015]
constantx = [0, 1]
constanty = [0.018, 0.018]

In [50]:
import matplotlib.pyplot as plt


plt.plot(sparsity, performance, marker='o', color='g', label='DPP Net (MUE Prediction)')
plt.plot(SVMx, SVMy, label='SVM')
plt.plot(constantx, constanty, label='Constant', color='b')
plt.ylabel('MSE')
plt.xlabel('% of extracted text')
plt.xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0], ['0%', '20%', '40%','60%','80%','100%'])
plt.title('Sparsity vs Performance')
plt.legend()
plt.show()



In [ ]: