In [1]:
# See how the old Switchboard training, test and heldout data differ from the current ones,
# to see if that accounts for the drop in performance.
In [1]:
swbd_old = "/media/dsg-labuser/NO_NAME/IS_15_swbd_data/data/switchboard/swbd_train_data.csv"
swbd_new = "../disfluency_detection/switchboard/swbd_disf_train_1_data.csv"
In [17]:
# the old methods
import random
import numpy as np
def shuffle_old(lol, seed):
    '''
    lol :: list of lists as input
    seed :: seed for the shuffling
    shuffles each list in place in the same (aligned) order
    '''
for l in lol:
random.seed(seed)
random.shuffle(l)
def minibatch_old(l, bs):
    '''
    l :: list of word idxs
    return a list of minibatches of indexes
    whose size is at most bs
    border cases are treated as follows:
    e.g. [0,1,2,3] and bs = 3
    will output:
    [[0],[0,1],[0,1,2],[1,2,3]]
    '''
out = [l[:i] for i in xrange(1, min(bs,len(l)+1) )]
out += [l[i-bs:i] for i in xrange(bs,len(l)+1) ]
assert len(l) == len(out)
return out
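# A quick sanity check of minibatch_old, taken from its docstring example:
# the windows grow to size bs and then slide along the sequence.
assert minibatch_old([0, 1, 2, 3], 3) == [[0], [0, 1], [0, 1, 2], [1, 2, 3]]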
def indicesFromLength_old(sentenceLength,bs,totalSize):
    '''
    return a list of index pairs (start/stop, inclusive) for each
    word, offset by totalSize; each pair spans at most bs words
    border cases are treated as follows:
    e.g. sentenceLength=4, bs = 3 and totalSize = 0
    will output:
    [[0,0],[0,1],[0,2],[1,3]]
    '''
    l = map(lambda x: totalSize + x, xrange(sentenceLength))
out = []
for i in xrange(0, min(bs,len(l)) ):
out.append([l[0],l[i]])
for i in xrange(bs+1,len(l)+1):
out.append([l[i-bs],l[i-1]])
    assert len(out) == sentenceLength  # one index pair per word
return out
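# Sanity check of indicesFromLength_old, mirroring its docstring example
# (sentenceLength=4, bs=3, totalSize=0):
assert indicesFromLength_old(4, 3, 0) == [[0, 0], [0, 1], [0, 2], [1, 3]]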
def contextwin_old(l, win):
    '''
    win :: int corresponding to the size of the window
    given a list of indexes composing a sentence,
    returns a list of lists of indexes, one context
    window surrounding each word in the sentence
    '''
assert (win % 2) == 1
assert win >=1
l = list(l)
lpadded = win/2 * [-1] + l + win/2 * [-1]
out = [ lpadded[i:i+win] for i in range(len(l)) ]
assert len(out) == len(l)
return out
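# Example of contextwin_old: with win=3 each word gets one index of left and
# one of right context, padded with -1 at the sentence boundaries.
assert contextwin_old([1, 2, 3], 3) == [[-1, 1, 2], [1, 2, 3], [2, 3, -1]]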
def contextwinbackwards_old(l, win):
'''
Same as contextwin except only backwards context (i.e. like an n-gram model)
'''
#assert (win % 2) == 1
assert win >=1
l = list(l)
lpadded = (win-1) * [-1] + l
out = [ lpadded[i:i+win] for i in range(len(l)) ]
assert len(out) == len(l)
return out
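# Example of contextwinbackwards_old: with win=3 each word gets only its two
# preceding indexes (n-gram style), padded with -1 at the start.
assert contextwinbackwards_old([1, 2, 3], 3) == [[-1, -1, 1], [-1, 1, 2], [1, 2, 3]]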
def corpusToIndexedMatrix_old(my_array_list, win, bs):
    '''
    Returns a matrix of context windows for a list of utterances, of
    dimensions n_words_in_corpus * win (one row per word across all arrays
    in my_array_list), and a corresponding list of start/stop index pairs
    (n_words_in_corpus * 2) saying where to access them, using bs
    (backprop distance) as the limiting history size.
    '''
sentences = [] # a list (of arrays, or lists?), returned as matrix
indices = [] #a list of index pairs (arrays?), returned as matrix
totalSize = 0
for sentence in my_array_list:
#print totalSize
#print sentence
cwords = contextwinbackwards_old(sentence, win) #get list of context windows
cindices = indicesFromLength_old(len(cwords),bs,totalSize)
indices.extend(cindices)
sentences.extend(cwords)
totalSize+=len(cwords)
return np.matrix(sentences, dtype='int32'), indices
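# Small illustration of corpusToIndexedMatrix_old on two toy "utterances":
# 5 words in total, so the window matrix has 5 rows of width win=2 and there
# are 5 start/stop index pairs into it.
toy_mat, toy_idx = corpusToIndexedMatrix_old([[1, 2], [3, 4, 5]], 2, 3)
assert toy_mat.shape == (5, 2)
assert len(toy_idx) == 5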
import gzip
import cPickle
import urllib
import logging
import os
import numpy as np
from collections import defaultdict
from os.path import isfile
logger = logging.getLogger(__name__)
PREFIX = os.getenv('ATISDATA', '')
SWITCHBOARDPREFIX = '/media/dsg-labuser/NO_NAME/IS_15_swbd_data/data/switchboard/'
def switchboardfold_old(fold=None, rpMid=False):
    if fold is not None:
assert fold in range(10)
ftrain = open(SWITCHBOARDPREFIX + 'FOLD'+str(fold)+'.csv.text')
else:
ftrain = open(SWITCHBOARDPREFIX +'swbd_train_data.csv')
fval = open(SWITCHBOARDPREFIX + 'swbd_heldout_data.csv')
ftest = open(SWITCHBOARDPREFIX + 'swbd_test_data.csv')
fval2 = open(SWITCHBOARDPREFIX + 'swbd_heldout_data.csv') #dummy
ftest2 = open(SWITCHBOARDPREFIX + 'swbd_test_data.csv') #dummy
    dic = defaultdict()  # renamed from 'dict' to avoid shadowing the builtin
    dic['words2idx'] = load_word_rep_old(SWITCHBOARDPREFIX + 'swbd_word_rep.csv')
    dic['pos2idx'] = load_word_rep_old(SWITCHBOARDPREFIX + 'swbd_pos_rep.csv')
    if rpMid:
        dic['labels2idx'] = load_tags_old(SWITCHBOARDPREFIX + 'swbd_tags_rpmid.csv')
    else:
        dic['labels2idx'] = load_tags_old(SWITCHBOARDPREFIX + 'swbd_tags.csv')
    # also have a train_dict which has only the tags the model can be trained on
    train_dict = defaultdict()
    if rpMid:
        train_dict['labels2idx'] = load_tags_old(SWITCHBOARDPREFIX + 'swbd_train_tags_rpmid.csv')
    else:
        train_dict['labels2idx'] = load_tags_old(SWITCHBOARDPREFIX + 'swbd_train_tags.csv')
    l = load_data_from_file_old(ftrain, dic['words2idx'], dic['pos2idx'], train_dict['labels2idx'], rpMid=rpMid)
    l1 = load_data_from_file_old(fval, dic['words2idx'], dic['pos2idx'], train_dict['labels2idx'], rpMid=rpMid)
    l2 = load_data_from_file_old(ftest, dic['words2idx'], dic['pos2idx'], train_dict['labels2idx'], rpMid=rpMid)
    l1_all = load_data_from_file_old(fval2, dic['words2idx'], dic['pos2idx'], dic['labels2idx'], rpMid=rpMid)  # val set with all tags
    l2_all = load_data_from_file_old(ftest2, dic['words2idx'], dic['pos2idx'], dic['labels2idx'], rpMid=rpMid)  # test set with all tags
    return l, l1, l2, l1_all, l2_all, dic, train_dict
def load_word_rep_old(filepath, dimension=None, word_rep_type="one_hot"):
"""Returns a word_rep_dictionary from word(string) indicating an index by an integer"""
word_rep_dictionary = None
if word_rep_type == "one_hot":
word_rep_dictionary = defaultdict(int) #TODO could use sparse matrices instead?
f = open(filepath)
for line in f:
l = line.split(",")
word_rep_dictionary[l[0]] = int(l[1])
f.close()
elif word_rep_type == "word_freq_count":
raise NotImplementedError()
elif word_rep_type == "neural_word":
raise NotImplementedError()
return word_rep_dictionary
def load_tags_old(filepath):
"""Returns a tag dictionary from word to a n int indicating index by an integer"""
tag_dictionary = defaultdict(int) #TODO could use sparse matrices instead?
f = open(filepath)
for line in f:
l = line.strip('\n').split(",")
tag_dictionary[l[0]] = int(l[1])
f.close()
return tag_dictionary
def load_data_from_file_old(f, word_rep, pos_rep, tags, rpMid=False, n_seq=None):
"""Loads into a two lists of arrays, one for words (seq), one for tags (targets), both equal length."""
print "loading training data"
#f = open(filepath)
count_seq = 0
count_step = 0
seq = []
pos_seq = []
targets = []
currentUtt = []
currentPOS = []
currentTags = []
for line in f:
        l = line.rstrip("\r\n")
        l = l.split('\t')  # fields: sequence number (only on an utterance's first line), word, POS, tag
        if l[0] != "" and currentUtt != []:  # a new utterance begins: flush the previous one
count_seq+=1
x = np.asarray(currentUtt)
p = np.asarray(currentPOS)
y = np.asarray(currentTags)
seq.append(x)
pos_seq.append(p)
targets.append(y)
currentUtt = []
currentPOS = []
currentTags = []
        if n_seq is not None and count_seq >= n_seq:
            break
        w = word_rep.get(l[1])
        pos = pos_rep.get(l[2])
        tag = tags.get(str(l[-1]))  # the tag is the last field; POS is at l[2]
        if tag is None:
            if str(l[-1]) == "<rpMid/>" and not rpMid:
                tag = tags.get("<f/>")
            elif "rpMid" in str(l[-1]):
                tag = tags.get("<rm-8/><rpMid/>")
            elif "rpEndSub" in str(l[-1]):
                tag = tags.get("<rm-8/><rpEndSub/>")
            elif "rpEndDel" in str(l[-1]):
                tag = tags.get("<rm-8/><rpEndSub/>")  # NB rpEndDel is collapsed onto the rpEndSub tag
            else:
                s = "No tag in tag dict:" + str(l[-1]) + "%%%"
                raw_input(s)  # pause so missing tags get noticed
        if w is None:
            logging.info("No word rep for " + l[1])
            w = word_rep.get("<unk>")
        if pos is None:
            logging.info("No pos rep for " + l[2])
            pos = pos_rep.get("<unk>")
        currentUtt.append(w)  # index into the one-hot word vocab
        currentPOS.append(pos)  # index into the one-hot POS vocab
        currentTags.append(tag)  # index into the one-hot tag set
        count_step += 1
    # flush the final utterance
    if currentUtt != []:
count_seq+=1
x = np.asarray(currentUtt)
p = np.asarray(currentPOS)
y = np.asarray(currentTags)
seq.append(x)
pos_seq.append(p)
targets.append(y)
assert len(seq) == len(targets) == len(pos_seq)
#raw_input()
print "loaded " + str(len(seq)) + " sequences"
f.close()
return (seq,pos_seq,targets)
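In [ ]:
# Minimal smoke test of load_data_from_file_old on an in-memory file. The toy
# vocab/POS/tag dicts below are hypothetical stand-ins for the swbd_*_rep.csv
# files, just to show the expected tab-separated format: a sequence number on
# an utterance's first line, then word, POS and tag columns.
from StringIO import StringIO
toy_file = StringIO("1\tuh\tUH\t<e/>\n\tjohn\tNNP\t<f/>\n2\tlikes\tVBZ\t<f/>\n")
toy_words = {"uh": 0, "john": 1, "likes": 2, "<unk>": 3}
toy_pos = {"UH": 0, "NNP": 1, "VBZ": 2, "<unk>": 3}
toy_tags = {"<e/>": 0, "<f/>": 1}
toy_seq, toy_pos_seq, toy_targets = load_data_from_file_old(
    toy_file, toy_words, toy_pos, toy_tags)
assert len(toy_seq) == 2  # two utterances parsed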
In [20]:
import itertools  # needed for itertools.chain below
import theano

def load_old():
    theano.config.optimizer = 'None'  # disables theano graph optimisation (marginally faster compilation)
# load the dataset
train_set, valid_set, test_set, valid_set_alltags, test_set_alltags, dic, train_dict = \
switchboardfold_old(fold=None, rpMid=False)
#adding train_dict as not all tags available in testing
#will not punish system for getting these wrong.
print str(len(train_dict['labels2idx'].items())) + " training classes"
print str(len(dic['labels2idx'].items())) + " testing classes"
print str(len(dic['words2idx'].items())) + " words in vocab"
    if dic.get('pos2idx') is not None:
        print str(len(dic['pos2idx'].items())) + " pos tags in vocab"
    idx2label_train = dict((k, v) for v, k in train_dict['labels2idx'].iteritems())  # first half (28) the same as the test
    idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
    idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems())
    if dic.get('pos2idx') is not None:
        idx2pos = dict((k, v) for v, k in dic['pos2idx'].iteritems())
#Now including pos tags
train_lex, train_pos, train_y = train_set
valid_lex, valid_pos, valid_y = valid_set
test_lex, test_pos, test_y = test_set
#sets with ALL tags, i.e. those not in training:
valid_y_alltags = valid_set_alltags[-1] #always the last one
test_y_alltags = test_set_alltags[-1] #always the last one
vocsize = len(dic['words2idx'].items())
#nclasses = len(dic['labels2idx'].items()) # actually smaller in reality, i.e. the below
nclasses = len(train_dict['labels2idx'].items())
nsentences = len(train_lex)
    possize = None
    if dic.get('pos2idx') is not None:
        possize = len(idx2pos.items())
nwords = len(list(itertools.chain(*train_y))) # TODO have added this
print str(nsentences) + " training sequences"
print "instantiating model"
# instantiate the model
# TODO shuffle_old([train_lex,train_pos,train_y], s['seed']) #shuffle training data
s = {'win' : 2, 'bs' : 9}
# The new code trying to use theano more: converting into matrices with indices
mycorpus, myb_indices = corpusToIndexedMatrix_old(train_lex, s['win'], s['bs']) #window size across number of words deep, gets matrix too
mypos = corpusToIndexedMatrix_old(train_pos, s['win'], s['bs'])[0] # first column is the actual POS windows (which are indices to one hot vectors)
    mylabels = list(itertools.chain(*train_y))
    mylabels = np.asarray(mylabels, dtype='int32')  # np, not numpy: matches the import above
# Now see how they differ...
return mycorpus, mypos, myb_indices, mylabels
In [21]:
my_c, my_pos, my_indices, my_labels = load_old()
In [23]:
import sys
sys.path.append("../../../")
In [31]:
from deep_disfluency.tagger.deep_tagger import DeepDisfluencyTagger
import numpy as np
from deep_disfluency.utils.tools import dialogue_data_and_indices_from_matrix
from deep_disfluency.load.load import load_tags
In [26]:
disf = DeepDisfluencyTagger(
config_file="../../../deep_disfluency/experiments/experiment_configs.csv",
config_number=21,
saved_model_dir="../../../deep_disfluency/experiments/021/epoch_40"
)
In [33]:
import os  # os.listdir is used below (also imported in an earlier cell)
train_dialogues_filepath = "../../data/disfluency_detection/feature_matrices/train"
n_extra = 0
utts_presegmented = True
window_size = 2
bs = 9
tags = "disf1_tags"
tag_to_index_map = load_tags("../../data/tag_representations/swbd_disf1_021_tags.csv")
train_matrices = [np.load(train_dialogues_filepath + "/" + fp)
                  for fp in os.listdir(train_dialogues_filepath)]
train_matrices = [dialogue_data_and_indices_from_matrix(
d_matrix,
n_extra,
pre_seg=utts_presegmented,
window_size=window_size,
bs=bs,
tag_rep=tags,
tag_to_idx_map=tag_to_index_map,
in_utterances=utts_presegmented)
for d_matrix in train_matrices
]
In [39]:
train_matrices[0][0]
Out[39]:
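In [ ]:
# Rough comparison sketch of the old and new training matrices. ASSUMPTION:
# the first element of each tuple returned by
# dialogue_data_and_indices_from_matrix is the word-window matrix, mirroring
# the old (corpus, pos, indices, labels) layout -- check that function's
# return value before trusting this comparison.
new_c = np.vstack([np.asarray(m[0]) for m in train_matrices])
print "old corpus:", my_c.shape, "idx range:", my_c.min(), "-", my_c.max()
print "new corpus:", new_c.shape, "idx range:", new_c.min(), "-", new_c.max()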
In [ ]: