How to run this notebook:
./generate.sh -t -l. I downloaded 1 million training samples with p = 1.0. I copied the first 10k lines of data into a "sample" directory so I could quickly play around with the data, then switched to the full dataset when done.
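A minimal sketch of how the "sample" copies could be made (the paths are illustrative and assume the full CSVs are already in the src/ directory):
In [ ]:
import itertools
import os

# Hypothetical paths: full data lives in SRC_DIR, the 10k-line copies go in SAMPLE_DIR.
SRC_DIR = '/home/brandon/ubuntu_dialogue_corpus/src/'
SAMPLE_DIR = os.path.join(SRC_DIR, 'sample/')
os.makedirs(SAMPLE_DIR, exist_ok=True)

for name in ['train.csv', 'valid.csv', 'test.csv']:
    with open(os.path.join(SRC_DIR, name)) as src, \
         open(os.path.join(SAMPLE_DIR, name), 'w') as dst:
        # Keep the header line plus the first 10k data lines.
        dst.writelines(itertools.islice(src, 10001))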
Random facts from paper:
In [7]:
import numpy as np
import os.path
import pdb
import pandas as pd
from pprint import pprint
#DATA_DIR = '/home/brandon/terabyte/Datasets/ubuntu_dialogue_corpus/'
DATA_DIR = '/home/brandon/ubuntu_dialogue_corpus/src/'  # append 'sample/' to use the 10k-line sample
TRAIN_PATH = DATA_DIR + 'train.csv'
VALID_PATH = DATA_DIR + 'valid.csv'
TEST_PATH = DATA_DIR + 'test.csv'
def get_training():
    """Return a dataframe of (Context, Utterance) pairs from train.csv."""
    # Load the data directly into a dataframe from the train.csv file.
    df_train = pd.read_csv(TRAIN_PATH)
    # Keep only examples with Label == 1; there's no reason to train a generative
    # model on the false (negative) responses.
    df_train = df_train.loc[df_train['Label'] == 1.0]
    # Don't care about the pandas indices, so reset them.
    df_train = df_train.reset_index(drop=True)
    # Keep just the first two columns (Context, Utterance).
    df_train = df_train[df_train.columns[:2]]
    return df_train
def get_validation():
    """Return a dataframe of (Context, Utterance) pairs from valid.csv."""
    # Load the data directly into a dataframe from the valid.csv file.
    df_valid = pd.read_csv(VALID_PATH)
    # Keep the first two columns and give them the same names as the training data.
    first_two_cols = df_valid.columns[:2]
    df_valid = df_valid[first_two_cols]
    df_valid.columns = ['Context', 'Utterance']
    return df_valid
df_train = get_training()
df_valid = get_validation()
In [8]:
# Get the data into a usable form for building a 'vocabulary' (unique words):
# the helpers below display conversations and split them into per-user turns.
import nltk, re, pprint
from nltk import word_tokenize
import pdb
def print_single_turn(turn: str):
    """Print each utterance of a single turn on its own line."""
    # Split leaves a blank final entry after the last '__eou__', so drop it.
    as_list_of_utters = turn.split('__eou__')[:-1]
    for idx_utter, utter in enumerate(as_list_of_utters):
        print("\t>>>", utter)
def print_conversation(df, index=0):
    """Display the ith conversation in a readable format."""
    # Get the row identified by 'index'.
    context_entry = df['Context'].values[index]
    target = df['Utterance'].values[index]
    # Split returns a blank last entry, so don't store it.
    turns = context_entry.split('__eot__')[:-1]
    print('--------------------- CONTEXT ------------------- ')
    for idx_turn, turn in enumerate(turns):
        print("\nUser {}: ".format(idx_turn % 2))
        print_single_turn(turn)
    print('\n--------------------- RESPONSE ------------------- ')
    print("\nUser {}: ".format(len(turns) % 2))
    print_single_turn(target)
def get_user_arrays(df):
    """Split every conversation into alternating turns for the two speakers.

    Returns two lists of equal length: userOne[i] and userTwo[i] form the i-th
    turn pair, collected across all conversations in df (so the returned lists
    are generally longer than the number of rows in df). Each entry is a plain
    string with the __eou__ markers removed.
    """
    userOne = []
    userTwo = []
    contexts = df['Context'].values
    targets = df['Utterance'].values
    assert(len(contexts) == len(targets))
    for i in range(len(contexts)):
        # A single conversation entry: multiple turns, each with multiple utterances.
        list_of_turns = contexts[i].lower().split('__eot__')[:-1] + [targets[i].lower()]
        # Make sure we have an even number of turns so they pair up cleanly.
        if len(list_of_turns) % 2 != 0:
            list_of_turns = list_of_turns[:-1]
        # Strip out the __eou__ occurrences (split on the leading space too,
        # otherwise removing the marker would leave two spaces behind).
        new_list_of_turns = []
        for turn in list_of_turns:
            utter_list = turn.lower().split(" __eou__")
            #if len(utter_list) > 3:
            #    utter_list = utter_list[:3]
            new_list_of_turns.append("".join(utter_list))
        #list_of_turns = [re.sub(' __eou__', '', t) for t in list_of_turns]
        # Even-indexed turns belong to user one, odd-indexed turns to user two.
        userOneThisConvo = new_list_of_turns[0::2]
        userTwoThisConvo = new_list_of_turns[1::2]
        userOne += userOneThisConvo
        userTwo += userTwoThisConvo
    assert(len(userOne) == len(userTwo))
    return userOne, userTwo
def save_to_file(fname, arr):
    with open(DATA_DIR + fname, "w") as f:
        for line in arr:
            f.write(line + "\n")
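The vocabulary itself isn't built in this cell; a rough sketch of what that step could look like (using nltk.word_tokenize over the per-user turn lists returned by get_user_arrays) is below. The function name and the 40k cutoff are illustrative, not part of the original pipeline.
In [ ]:
from collections import Counter

def build_vocabulary(turn_lists, max_vocab_size=40000):
    """Count word frequencies over all turns and keep the most common words."""
    counts = Counter()
    for turns in turn_lists:
        for turn in turns:
            counts.update(nltk.word_tokenize(turn))
    # Most frequent words first, truncated to the desired vocabulary size.
    return [w for w, _ in counts.most_common(max_vocab_size)]

# Example usage (assumes userOne/userTwo from get_user_arrays):
# vocab = build_vocabulary([userOne, userTwo])
# print(len(vocab), vocab[:10])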
In [4]:
df_train.describe()
Out[4]:
In [5]:
pd.options.display.max_colwidth = 500
df_train.head(2)
Out[5]:
In [6]:
print_conversation(df_train, 3)
In [10]:
#df_merged = pd.DataFrame(df_train['Context'].map(str) + df_train['Utterance'])
userOne, userTwo = get_user_arrays(df_train)
df_turns = pd.DataFrame({'UserOne': userOne, 'UserTwo': userTwo})
df_turns.head(200)
Out[10]:
In [5]:
userOne[0]
Out[5]:
In [6]:
def get_sentences(userOne, userTwo):
    """Pair up sentences from the two users into encoder/decoder lists."""
    encoder = []
    decoder = []
    assert(len(userOne) == len(userTwo))
    for i in range(len(userOne)):
        # Sentence-tokenize each user's turn and drop bare '.' sentences.
        one = nltk.sent_tokenize(userOne[i])
        one = [s for s in one if s != '.']
        two = nltk.sent_tokenize(userTwo[i])
        two = [s for s in two if s != '.']
        combine = one + two
        assert(len(combine) == len(one) + len(two))
        # Keep an even number of sentences so encoder/decoder halves line up.
        if len(combine) % 2 != 0:
            combine = combine[:-1]
        enc = combine[0::2]
        dec = combine[1::2]
        assert(len(enc) == len(dec))
        encoder.append(enc)
        decoder.append(dec)
    return encoder, decoder
encoder, decoder = get_sentences(userOne, userTwo)
print('done')
In [ ]:
# Keep only the first sentence of each encoder/decoder list and word-tokenize it
# (this assumes every conversation produced at least one sentence pair).
encoder = [nltk.word_tokenize(s[0]) for s in encoder]
decoder = [nltk.word_tokenize(s[0]) for s in decoder]
In [ ]:
max_enc_len = max([len(s) for s in encoder])
max_dec_len = max([len(s) for s in decoder])
print(max_enc_len)
print(max_dec_len)
In [ ]:
encoder_lengths = [len(s) for s in encoder]
decoder_lengths = [len(s) for s in decoder]
df_lengths = pd.DataFrame({'EncoderSentLength': encoder_lengths, 'DecoderSentLength': decoder_lengths})
df_lengths.describe()
In [ ]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = 9, 5
fig, axes = plt.subplots(nrows=1, ncols=2)
plt.subplot(1, 2, 1)
plt.hist(encoder_lengths)
plt.subplot(1, 2, 2)
plt.hist(decoder_lengths, color='b')
plt.tight_layout()
plt.show()
In [11]:
save_to_file("train_from.txt", userOne)
save_to_file("train_to.txt", userTwo)
In [12]:
print("df_valid has", len(df_valid), "rows.")
df_valid.head()
Out[12]:
In [13]:
userOne, userTwo = get_user_arrays(df_valid)
save_to_file("valid_from.txt", userOne)
save_to_file("valid_to.txt", userTwo)
In [14]:
print('done')
In [18]:
import matplotlib.pyplot as plt
%matplotlib inline
userOne, userTwo = get_user_arrays(df_train)
# Regular expressions used to tokenize.
_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")
_DIGIT_RE = re.compile(br"\d")
lengths = np.array([len(t.strip().split()) for t in userOne])
max_ind = lengths.argmax()
print(max(lengths), "at", max_ind)
print("Sentence:\n", userOne[max_ind])
In [19]:
import matplotlib.pyplot as plt
# Drop the 20 longest turns so the histogram isn't dominated by outliers.
plt.hist(sorted(lengths)[:-20])
Out[19]:
In [20]:
n_under_100 = sum([1 if l < 100 else 0 for l in lengths])
print(n_under_100, "out of", len(lengths), "({:.1%})".format(float(n_under_100) / len(lengths)))
In [21]:
df_lengths = pd.DataFrame(lengths)
In [22]:
df_lengths.describe()
Out[22]:
In [3]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Vocabulary size used by the model.
vocab_size = 40000
# Cross-entropy loss when guessing uniformly at random over the vocabulary.
loss_random_guess = np.log(float(vocab_size))
print("Loss for uniformly random guessing is", loss_random_guess)
sent_length = [5, 10, 25]
# Model outputs the correct target word p percent of the time
# (start at 1 to avoid dividing by zero).
pred_accuracy = np.arange(1, 100)
plt.plot(pred_accuracy, [1. / p for p in pred_accuracy])
Out[3]:
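For reference, the per-token cross-entropy when the model assigns probability p to the correct word is -log(p). A quick sketch of that curve against the random-guess baseline above (same vocab_size of 40k):
In [ ]:
import numpy as np
import matplotlib.pyplot as plt

# Cross-entropy loss as a function of the probability assigned to the correct word,
# with the uniform-random baseline log(vocab_size) drawn for comparison.
p = np.linspace(0.01, 1.0, 100)
plt.plot(p, -np.log(p), label='-log(p)')
plt.axhline(np.log(40000.), linestyle='--', label='random guess (vocab = 40k)')
plt.xlabel('probability assigned to correct word')
plt.ylabel('per-token loss')
plt.legend()
plt.show()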
In [27]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = 10, 8
def _sample(logits, t):
    """Softmax with temperature t: lower t sharpens the distribution."""
    res = logits / t
    res = np.exp(res) / np.sum(np.exp(res))
    return res
N = 100
x = np.arange(N)
before = np.array([1.0+i**2 for i in range(N)])
before /= before.sum()
plt.plot(x, before, 'b--', label='before')
after = _sample(before, 0.1)
plt.plot(x, after, 'g--', label='temp=0.1')
after = _sample(before, 0.2)
print(after.argmax())
plt.plot(x, after, 'r--', label='temp=0.2')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Out[27]:
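Note that _sample above is handed a probability distribution directly; temperature sampling is more commonly applied to logits (log-probabilities). A small self-contained sketch of that variant, using the same quadratic toy distribution:
In [ ]:
import numpy as np
import matplotlib.pyplot as plt

def sample_with_temperature(probs, temperature):
    """Apply temperature to log-probabilities, then renormalize with a softmax."""
    logits = np.log(probs)
    scaled = logits / temperature
    scaled -= scaled.max()  # subtract the max for numerical stability
    out = np.exp(scaled)
    return out / out.sum()

N = 100
x = np.arange(N)
before = np.array([1.0 + i**2 for i in range(N)])
before /= before.sum()

# temperature = 1.0 recovers the original distribution; lower values sharpen it.
for t in [1.0, 0.5, 0.1]:
    plt.plot(x, sample_with_temperature(before, t), label='temp={}'.format(t))
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)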
In [5]:
np.info(plt.plot)
In [ ]: