The Pessimistic is a proof of concept for style adaptation: the machine takes a sentence as input and returns a sentence about a similar subject but conveying a negative sentiment.
The machine needs the latent representations of the sentences in the dataset, which can be computed with the script compute_latent_representations.py.
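Each line of that file is expected, by the parsing cell further down, to hold a sentence and its 16-dimensional latent code separated by a '|', with the code written as comma-separated floats. A minimal writer sketch under that assumption (the authoritative format is whatever compute_latent_representations.py actually produces):
def write_latent_line(fp, sentence, z):
    # assumed line format: <sentence>|<z_1>,<z_2>,...,<z_16>
    fp.write(sentence + '|' + ','.join(str(u) for u in z) + '\n')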
In [141]:
import pandas as pd
import numpy as np
import time
import datetime
import json
from tqdm import tqdm
import os
import tensorflow as tf
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from bokeh.io import output_notebook
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool
output_notebook()
from data_utils_LMR import prepare_data, read_data, EncoderDecoder
from model import Vrae as Vrae_model
from batch import Generator
prepare_data(1000)
training_dir = 'logs/'
training_dir += 'no_char2word'
# sentiment analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentimentAnalyzer = SentimentIntensityAnalyzer()
def getSentimentScore(sentence):
    """Return the VADER (negative, neutral, positive) scores of a sentence."""
    scores = sentimentAnalyzer.polarity_scores(sentence)
    return (scores['neg'], scores['neu'], scores['pos'])
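# Quick sanity check (illustrative only; the exact numbers depend on the
# VADER lexicon). MachineSays below relies on this (neg, neu, pos) ordering.
print getSentimentScore("it was terrible.")
print getSentimentScore("I totally loved it.")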
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
def string2bool(st):
    if st.lower() == "true":
        return True
    else:
        return False
with open(training_dir + '/flags.json', 'r') as fp:
    FLAGS = dotdict(json.loads(fp.read()))
for k, v in FLAGS.iteritems():
    print k, ':', v
n_samples = 5000  # int(FLAGS.batch_size)
In [142]:
labels = []
zs = []
with tf.gfile.GFile(training_dir + "/latent_representations.txt", mode="r") as source_file:
    source = source_file.readline()
    counter = 0
    while source:
        source = source_file.readline()
        if len(source.split('|')) > 1:
            z_ = [float(u) for u in source.split('|')[1].split(',')]
            if len(z_) == 16:
                labels.append(source.split('|')[0])
                zs.append(z_)
                counter += 1
print len(zs), 'points'
In [143]:
from sklearn.neighbors import KDTree
kdt = KDTree(np.array(zs), leaf_size=1, metric='euclidean')
def getNeighbor(zz, n_similar=5):
    """
    Take a latent code z and return the nearest neighbors in the latent space of the training set.
    """
    dist, ind = kdt.query(zz, k=n_similar)
    return [labels[k] for k in list(ind[0])], dist
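As a quick sanity check of the lookup (a sketch only; the returned sentences depend on the trained model), querying the tree with one of the stored codes should return its own sentence among the nearest neighbours:
neighbours, distances = getNeighbor([zs[0]], n_similar=3)
print neighbours
print distances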
In [144]:
with open(training_dir + '/training_parameters.json', 'r') as fp:
    training_parameters = json.loads(fp.read())
# vocabulary encoder-decoder
encoderDecoder = EncoderDecoder()
num_symbols = encoderDecoder.vocabularySize()
# prepare data
sentences, ratings = read_data(max_size=None,
                               max_sentence_size=training_parameters['seq_max'],
                               min_sentence_size=int(FLAGS.sequence_min),
                               test=False)
print len(sentences), " sentences"
config = tf.ConfigProto(
    device_count={'GPU': 0},  # do not use GPU for testing
)
FLAGS.peephole = False
# load model
vrae_model = Vrae_model(char2word_state_size=int(FLAGS.char2word_state_size),
                        char2word_num_layers=int(FLAGS.char2word_num_layers),
                        encoder_state_size=int(FLAGS.encoder_state_size),
                        encoder_num_layers=int(FLAGS.encoder_num_layers),
                        decoder_state_size=int(FLAGS.decoder_state_size),
                        decoder_num_layers=int(FLAGS.decoder_num_layers),
                        latent_dim=int(FLAGS.latent_dim),
                        batch_size=n_samples,
                        num_symbols=num_symbols,
                        latent_loss_weight=float(FLAGS.latent_loss_weight),
                        dtype_precision=FLAGS.dtype_precision,
                        cell_type=FLAGS.cell,
                        peephole=FLAGS.peephole,
                        input_keep_prob=float(FLAGS.input_keep_prob),
                        output_keep_prob=float(FLAGS.output_keep_prob),
                        sentiment_feature=string2bool(FLAGS.use_sentiment_feature),
                        use_char2word=string2bool(FLAGS.use_char2word))
def zToXdecoded(session, z_sample, s_length):
    """Decode the latent code z_sample back into a plain-text sentence (s_length sets the decoded sequence length)."""
    x_reconstruct = vrae_model.zToX(session, z_sample, s_length)
    return encoderDecoder.prettyDecode(np.argmax(x_reconstruct[0], axis=1))
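For reference, a small sketch of how zToXdecoded could be used to sample the generative model directly; the (1, latent_dim) input shape and the decoded length of ten are assumptions, not something the notebook pins down:
def sampleFromPrior(sess, sentence_length=10):
    # draw one point from the standard-normal prior of the VAE and decode it;
    # shape assumption: zToX accepts a (1, latent_dim) array, matching the
    # 16-dimensional codes stored in latent_representations.txt
    z_random = np.random.normal(size=(1, int(FLAGS.latent_dim)))
    return zToXdecoded(sess, z_random, sentence_length)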
In [153]:
def MachineSays(sess, u, n_sample=20):
    """Return training sentences close to u in latent space, most negative first."""
    sent = getSentimentScore(u)
    sent_index = 0  # index of the negative ('grumpy') score in (neg, neu, pos)
    # encode the input sentence into its latent code z
    zz = vrae_model.XToz(sess, *encoderDecoder.encodeForTraining(u), sentiment=sent)[0]
    # retrieve the n_sample nearest training sentences in the latent space
    res, dist = getNeighbor([list(zz)], n_sample)
    if u.lower() in res:
        res.remove(u.lower())
    # sort the neighbours by decreasing negative sentiment
    out = []
    for uu in sorted(zip(res, list(dist[0])), key=lambda x: getSentimentScore(x[0])[sent_index], reverse=True):
        out.append(uu[0])
    return out
In [154]:
answers = []
saver = tf.train.Saver()
with tf.Session(config=config) as sess:
    saver.restore(sess, "./" + training_dir + '/model.ckp')
    for uu in MachineSays(sess, "I like this movie.", 50):
        print uu
In [152]:
us = [ "I totally loved it.",
"I was really bad.",
"it was terrible.",
"the acting was mostly good.",
"I liked this movie.",
"it was a nice movie.",
"the story was amazing.",
"it was not bad.",
"The acting was good.",
"The music was good.",
]
answers = []
saver = tf.train.Saver()
with tf.Session(config=config) as sess:
    saver.restore(sess, "./" + training_dir + '/model.ckp')
    for u in us:
        answers.append(MachineSays(sess, u)[0])
df = pd.DataFrame()
df["input"] = us
df["answer"] = answers
df
Out[152]:
In [122]:
print df.to_latex()
In [ ]:
In [125]:
us = [ "I totally loved it.",
"I was really bad.",
"it was terrible.",
"the acting was mostly good.",
"I liked this movie.",
"it was a nice movie.",
"the story was amazing.",
"it was not bad.",
"The acting was good.",
"The music was good.",
]
saver = tf.train.Saver()
df = pd.DataFrame()
with tf.Session(config=config) as sess:
    saver.restore(sess, "./" + training_dir + '/model.ckp')
    for u in us:
        # pick a random training sentence as the query
        k = int(np.random.random() * len(sentences))
        u = encoderDecoder.prettyDecode(sentences[k])
        l = MachineSays(sess, u, 20)
        # pad the neighbour list so every column has the same length
        while len(l) < 20:
            l.append("")
        df[u] = l
df
Out[125]:
In [120]:
print df.to_latex()
In [ ]: