In [1]:
# Imports — stdlib first, then third-party, then project-local.
# (Fixed: torch, torch.nn and Variable were each imported twice; one
# canonical import block suffices.)
import argparse
import os
import shutil
import gzip

import torch
import torch.nn as nn
from torch.autograd import Variable  # NOTE(review): Variable is deprecated in modern PyTorch

from dpp_nets.utils.language import Vocabulary, BeerDataset, EvalSet, custom_collate
from dpp_nets.layers.layers import ChunkTrainer, ChunkTrainerRel
In [2]:
# Build the evaluation vocabulary from pretrained embeddings and the
# training corpus.
# (Fixed: `Vocabulary` was re-imported here although the import cell already
# brings it in; removed the duplicate import.)
# TODO(review): hardcoded absolute paths — consider a DATA_DIR config constant.
embd_path = '/Users/Max/data/beer_reviews/review+wiki.filtered.200.txt.gz'
word_path = '/Users/Max/data/beer_reviews/reviews.all.train.words.txt.gz'

vocab = Vocabulary()
vocab.loadPretrained(embd_path)   # pretrained word vectors
vocab.setStops()                  # mark stop words
vocab.loadCorpus(word_path)       # add corpus words
vocab.updateEmbedding()           # presumably resizes the embedding matrix to the new vocab — confirm in dpp_nets
vocab.setCuda(False)              # CPU-only evaluation
In [3]:
# Load saved checkpoint
model = 'allwords1reg0.1reg_mean20.0lr0.001marginal_best_ckp.pth.tar'
model_dir = '/Users/Max/checkpoints/beer_reviews/marginal/'
model_path = model_dir + model
model = torch.load(model_path, map_location=lambda storage, loc: storage)
vocab.EmbeddingBag.load_state_dict(model['embedding'])
In [4]:
# Network dimensions — must match the values used at training time.
EMBD_DIM = 200
KERNEL_DIM = 200
HIDDEN_DIM = 500
ENC_DIM = 200
# 'all' and 'short' aspects predict 3 scores; a single aspect predicts 1.
# (Idiom: set literal instead of set([...]).)
TARGET_DIM = 3 if model['aspect'] in {'all', 'short'} else 1

# Rebuild the trainer architecture used at training time and restore weights.
if model['mode'] == 'sents':
    trainer = ChunkTrainer(EMBD_DIM, HIDDEN_DIM, KERNEL_DIM, ENC_DIM, TARGET_DIM)
else:
    trainer = ChunkTrainerRel(EMBD_DIM, HIDDEN_DIM, KERNEL_DIM, ENC_DIM, TARGET_DIM)
trainer.load_state_dict(model['model'])
trainer.activation = nn.Sigmoid()   # evaluation-time activation
trainer.reg = model['reg']          # regularization weights stored in the checkpoint
trainer.reg_mean = model['reg_mean']

# Human-annotated rationales for evaluation.
rat_path = '/Users/Max/data/beer_reviews/annotations.json'
evalset = EvalSet(rat_path, vocab)
In [80]:
# Plot a table
# Print one row per training epoch: total loss, prediction loss and
# regularization loss (each metric is a one-element list keyed by epoch).
print('__________________________Training Table__________________________')
for epoch, losses in model['train_loss'].items():
    row = ['Epoch: %d' % epoch,
           'Loss: %.5f' % losses[0],
           'Pred Loss: %.5f' % model['train_pred_loss'][epoch][0],
           'Reg Loss: %.5f' % model['train_reg_loss'][epoch][0]]
    print(" | ".join(row))
In [81]:
from dpp_nets.helper.plotting import plot_floats

# Training curves: combined objective, prediction loss, regularizer.
for key, ylabel, title in [('train_loss', 'MSE + Reg', 'Training MSE + Reg'),
                           ('train_pred_loss', 'MSE', 'Training MSE'),
                           ('train_reg_loss', 'Reg', 'Training Reg')]:
    plot_floats(model[key], xlabel='Epochs', ylabel=ylabel, title=title)
In [82]:
# Print one row per validation epoch: total loss, prediction loss and
# regularization loss (each metric is a one-element list keyed by epoch).
print('_________________________Validation Table_________________________')
for epoch, losses in model['val_loss'].items():
    row = ['Epoch: %d' % epoch,
           'Loss: %.5f' % losses[0],
           'Pred Loss: %.5f' % model['val_pred_loss'][epoch][0],
           'Reg Loss: %.5f' % model['val_reg_loss'][epoch][0]]
    print(" | ".join(row))
In [83]:
from dpp_nets.helper.plotting import plot_floats

# Validation curves: combined objective, prediction loss, regularizer.
for key, ylabel, title in [('val_loss', 'MSE + Reg', 'Validation MSE + Reg'),
                           ('val_pred_loss', 'MSE', 'Validation MSE'),
                           ('val_reg_loss', 'Reg', 'Validation Reg')]:
    plot_floats(model[key], xlabel='Epochs', ylabel=ylabel, title=title)
In [84]:
# Evaluation on the annotated test set: losses, then rationale precision.
loss, pred_loss, reg_loss = evalset.computeLoss(trainer, model['mode'])
summary = ['Test Set:', 'Loss: %.5f' % loss,
           'Pred Loss: %.5f' % pred_loss, 'Reg Loss: %.5f' % reg_loss]
print(" | ".join(summary))

# Precision of the extracted text against the human annotations.
prec, extract = evalset.evaluatePrecision(trainer, model['mode'])
print(" | ".join(['Test Set:', 'Precision: %.5f' % prec, 'Extract: %.5f' % extract]))
In [85]:
# Random Samples
# Show sampled extractions for review index 705.
# NOTE(review): exact output defined by EvalSet.sample in dpp_nets — confirm there.
evalset.sample(trainer, model['mode'],705)
In [86]:
# Random Marginals
# Compute marginals for review index 100.
# NOTE(review): semantics defined by EvalSet.computeMarginals in dpp_nets — confirm there.
evalset.computeMarginals(trainer, model['mode'],100)
In [87]:
# MAP-based prediction loss on the evaluation set (cell output shows the value).
# NOTE(review): depends on `trainer`/`model` from earlier cells.
evalset.computeMAPPredLoss(trainer, model['mode'])
Out[87]:
In [13]:
# MUE-based prediction loss on the evaluation set (cell output shows the value).
evalset.computeMUEPredLoss(trainer, model['mode'])
In [ ]:
# Populate evalset.MAPS, which the word-frequency cells below iterate over.
evalset.create_MAPS(trainer, model['mode'])
In [ ]:
# Split MAP-extracted words by the review's target score: words from
# high-scoring reviews (t > 0.9) go to `good`, words from low-scoring
# reviews (t < 0.45) go to `bad`; mid-range reviews contribute to neither.
good = [word for m, t in evalset.MAPS if t > 0.9 for tup in m for word in tup]
bad = [word for m, t in evalset.MAPS if t < 0.45 for tup in m for word in tup]
In [ ]:
# Flatten every MAP extraction (all reviews) into a single word list.
l = [word for m, t in evalset.MAPS for tup in m for word in tup]
In [ ]:
from collections import Counter
import operator


def rank_desc(counter):
    """Return the counter's (item, count) pairs, most frequent first."""
    return list(reversed(sorted(counter.items(), key=operator.itemgetter(1))))


# Word frequencies over all / high-score / low-score extractions,
# each ranked by descending count.
all_revs = Counter(l)
good_revs = Counter(good)
bad_revs = Counter(bad)
sorted_good = rank_desc(good_revs)
sorted_bad = rank_desc(bad_revs)
sorted_all = rank_desc(all_revs)
In [ ]:
# Display the most frequent words among the "good" extractions.
n_words = 20
for word, count in sorted_good[:n_words]:
    print(word, count)
In [ ]:
from dpp_nets.utils.language import Vocabulary, BeerDataset, simple_collate, custom_collate_reinforce, custom_collate

# Held-out reviews matching the checkpoint's aspect and mode,
# delivered one review per batch.
val_path = '/Users/Max/data/beer_reviews/reviews.%s.heldout.%s.txt.gz' % (model['aspect'], model['mode'])
val_set = BeerDataset(val_path)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=1, collate_fn=simple_collate)
def compute_word_clouds(trainer, good_thres, bad_thres):
    # NOTE(review): this looks like an unfinished draft —
    #   * `batch`, `args`, `review` and `computeMAP` are free variables not
    #     defined in any visible cell, so calling this raises NameError;
    #   * the parameters `good_thres` / `bad_thres` are never used;
    #   * `MAP`, `good` and `bad` are built but nothing is returned.
    # Either finish this function or delete the cell.
    # (The notebook export stripped indentation; the trailing `good`/`bad`
    # initializers may have been intended at top level — confirm.)
    words, target = custom_collate_reinforce(batch, vocab, args.alpha_iter, args.cuda)
    target_mean = target.data.mean()
    # Compute MAP
    kernel, _ = trainer.kernel_net(words)
    L = (kernel.data.mm(kernel.data.t())).numpy()
    return_ixs = computeMAP(L)
    MAP = []
    for i in return_ixs:
        txt = list(review.clean.keys())[i]
        label = list(review.clean.values())[i]
        rat = review.rev[txt]
        MAP.extend(rat)
    good = []
    bad = []
In [ ]:
# Debug: dump every batch from the validation loader.
# NOTE(review): prints the entire held-out set — very large output; consider
# printing only the first batch. Also leaks `batch` into later cells.
for batch in val_loader:
    print(batch)
In [ ]:
# NOTE(review): relies on `batch` leaking out of the loop in the previous
# cell (it holds the LAST batch after the loop ends) — hidden state.
review, target = batch[0]['review'], batch[0]['target']
In [ ]:
# Collate the leftover batch with the evaluation vocabulary; the collated
# result is displayed as the cell output.
custom_collate(batch, evalset.vocab)
In [ ]:
# Count sizes - no tensor operations
# Map to Embeddings
# NOTE(review): `d` is not defined in any visible cell — this raises
# NameError on a fresh run; presumably `batch[0]` was meant. Confirm and fix.
rep = vocab.returnEmbds(d['review'])
rep
In [ ]:
# Inspect the third element of the first entry of evalset.words;
# exact structure defined by EvalSet in dpp_nets — confirm there.
evalset.words[0][2]
In [ ]:
# Variance of the stacked targets along dim 0, averaged over the remaining dims.
torch.stack(evalset.targets).var(0).mean()
In [52]:
import numpy as np

# Sparsity (fraction of extracted text) vs. test MSE for the DPP net,
# plus flat baselines for an SVM and a constant predictor.
sparsity = np.array([0.04, 0.08, 0.11, 0.19, 0.24, 0.3, 0.5, 1])
performance = np.array([0.014, 0.0136, 0.0127, 0.0111, 0.01123, 0.0105, 0.0095, 0.0092])

# Horizontal baseline segments spanning the full x-range.
SVMx, SVMy = [0, 1], [0.015, 0.015]
constantx, constanty = [0, 1], [0.0165, 0.0165]
In [53]:
import matplotlib.pyplot as plt

# Sparsity / performance trade-off figure, saved to disk for the write-up.
fig, ax = plt.subplots()
ax.plot(sparsity, performance, marker='o', color='g', label='DPP Net (MUE Prediction)')
ax.plot(SVMx, SVMy, label='SVM')
ax.plot(constantx, constanty, label='Constant', color='b')
ax.set_ylabel('MSE')
ax.set_xlabel('% of extracted text')
ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_xticklabels(['0%', '20%', '40%', '60%', '80%', '100%'])
ax.set_title('Sparsity vs Performance')
ax.legend()
fig.savefig('sparsityvsperformance')
plt.show()
In [77]:
# Inspect the raw review used in the sampling cell above (index 705).
evalset.reviews[705]
Out[77]:
In [5]:
# Inspect another raw review (index 713).
evalset.reviews[713]
Out[5]:
In [ ]: