In [1]:
# See how the old switchboard training, test and heldout data differ from the current versions,
# to check whether that accounts for the drop in performance.

In [1]:
swbd_old = "/media/dsg-labuser/NO_NAME/IS_15_swbd_data/data/switchboard/swbd_train_data.csv"
swbd_new = "../disfluency_detection/switchboard/swbd_disf_train_1_data.csv"

In [17]:
# the old methods
import random
import numpy as np

def shuffle_old(lol, seed):
    '''
    lol :: list of lists as input
    seed :: seed for the shuffling

    shuffles each list in place, in the same order
    '''
    for l in lol:
        random.seed(seed)
        random.shuffle(l)

def minibatch_old(l, bs):
    '''
    l :: list of word idxs
    return a list of minibatches of indexes
    whose size is at most bs
    border cases are treated as follows:
    eg: [0,1,2,3] and bs = 3
    will output:
    [[0],[0,1],[0,1,2],[1,2,3]]
    '''
    out  = [l[:i] for i in xrange(1, min(bs,len(l)+1) )]
    out += [l[i-bs:i] for i in xrange(bs,len(l)+1) ]
    assert len(l) == len(out)
    return out

def indicesFromLength_old(sentenceLength,bs,totalSize):
    '''
    return a list of index pairs (start/stop), one per word,
    with the max difference between start and stop equal to bs
    border cases are treated as follows:
    eg: sentenceLength=4 and bs = 3
    will output:
    [[0,0],[0,1],[0,2],[1,3]]
    '''
    l = [totalSize + x for x in xrange(sentenceLength)]
    out = []
    for i in xrange(0, min(bs,len(l)) ):
        out.append([l[0],l[i]]) 
    for i in xrange(bs+1,len(l)+1):
        out.append([l[i-bs],l[i-1]])
    assert len(out) == sentenceLength
    return out

def contextwin_old(l, win):
    '''
    win :: int corresponding to the size of the window
    given a list of indexes composing a sentence
    it will return a list of list of indexes corresponding
    to context windows surrounding each word in the sentence
    '''
    assert (win % 2) == 1
    assert win >=1
    l = list(l)

    lpadded = win/2 * [-1] + l + win/2 * [-1]
    out = [ lpadded[i:i+win] for i in range(len(l)) ]

    assert len(out) == len(l)
    return out

def contextwinbackwards_old(l, win):
    '''
    Same as contextwin except only backwards context (i.e. like an n-gram model)
    '''
    #assert (win % 2) == 1
    assert win >=1
    l = list(l)
    lpadded = (win-1) * [-1] + l
    out = [ lpadded[i:i+win] for i in range(len(l)) ]

    assert len(out) == len(l)
    return out

def corpusToIndexedMatrix_old(my_array_list, win, bs):
    '''
    Returns a matrix of backwards context windows for a list of utterances,
    of dimensions n_words_in_corpus x win (where n_words_in_corpus is the total
    length of all the arrays in my_array_list), together with a list of
    (start, stop) index pairs, one per word, recording where to access these,
    using bs (the backprop distance) as the limiting history size.
    '''
    sentences = [] # a list of context-window lists, returned as a matrix
    indices = [] # a list of [start, stop] index pairs, returned as a plain list
    totalSize = 0
    for sentence in my_array_list:
        #print totalSize
        #print sentence
        cwords = contextwinbackwards_old(sentence, win) #get list of context windows
        cindices = indicesFromLength_old(len(cwords),bs,totalSize)

        indices.extend(cindices)
        sentences.extend(cwords)
        totalSize+=len(cwords)
    
    return np.matrix(sentences, dtype='int32'), indices


import gzip
import cPickle
import urllib
import logging
import os
import numpy as np
from collections import defaultdict

from os.path import isfile

logger = logging.getLogger(__name__)

PREFIX = os.getenv('ATISDATA', '')
SWITCHBOARDPREFIX = '/media/dsg-labuser/NO_NAME/IS_15_swbd_data/data/switchboard/'

def switchboardfold_old(fold=None, rpMid=False):
    if not fold is None:
        assert fold in range(10)
        ftrain = open(SWITCHBOARDPREFIX + 'FOLD'+str(fold)+'.csv.text')
    else:
        ftrain = open(SWITCHBOARDPREFIX +'swbd_train_data.csv')
        fval = open(SWITCHBOARDPREFIX + 'swbd_heldout_data.csv')
        ftest = open(SWITCHBOARDPREFIX + 'swbd_test_data.csv')
        fval2 = open(SWITCHBOARDPREFIX + 'swbd_heldout_data.csv') #dummy
        ftest2 = open(SWITCHBOARDPREFIX + 'swbd_test_data.csv') #dummy
    dict = defaultdict()
    dict['words2idx'] = load_word_rep_old(SWITCHBOARDPREFIX +'swbd_word_rep.csv')
    dict['pos2idx'] = load_word_rep_old(SWITCHBOARDPREFIX + 'swbd_pos_rep.csv')
    if rpMid == True:
        dict['labels2idx'] = load_tags_old(SWITCHBOARDPREFIX +'swbd_tags_rpmid.csv')
    else:
        dict['labels2idx'] = load_tags_old(SWITCHBOARDPREFIX +'swbd_tags.csv')


    #also have a traindict which has only the tags it can be trained on
    train_dict = defaultdict()
    if rpMid == True:
        train_dict['labels2idx'] = load_tags_old(SWITCHBOARDPREFIX +'swbd_train_tags_rpmid.csv')
    else:
        train_dict['labels2idx'] = load_tags_old(SWITCHBOARDPREFIX +'swbd_train_tags.csv')
    
    l = load_data_from_file_old(ftrain, dict['words2idx'], dict['pos2idx'], train_dict['labels2idx'], rpMid=rpMid)
    l1 = load_data_from_file_old(fval, dict['words2idx'], dict['pos2idx'], train_dict['labels2idx'], rpMid=rpMid)
    l2 = load_data_from_file_old(ftest, dict['words2idx'], dict['pos2idx'], train_dict['labels2idx'], rpMid=rpMid)
    l1_all = load_data_from_file_old(fval2, dict['words2idx'], dict['pos2idx'], dict['labels2idx'], rpMid=rpMid) #val set with all tags
    l2_all = load_data_from_file_old(ftest2, dict['words2idx'], dict['pos2idx'], dict['labels2idx'], rpMid=rpMid) #test set with all tags
    
    return l,l1,l2,l1_all,l2_all,dict,train_dict

def load_word_rep_old(filepath, dimension=None, word_rep_type="one_hot"):
    """Returns a word_rep_dictionary from word(string) indicating an index by an integer"""
    word_rep_dictionary = None
    if word_rep_type == "one_hot":
        word_rep_dictionary = defaultdict(int) #TODO could use sparse matrices instead?
        f = open(filepath)
        for line in f:
            l = line.split(",")
            word_rep_dictionary[l[0]] = int(l[1])
        f.close()
    elif word_rep_type == "word_freq_count":
        raise NotImplementedError()
    elif word_rep_type == "neural_word":
        raise NotImplementedError()
    return word_rep_dictionary

def load_tags_old(filepath):
    """Returns a tag dictionary from word to a n int indicating index by an integer"""
    tag_dictionary = defaultdict(int) #TODO could use sparse matrices instead?
    f = open(filepath)
    for line in f:
        l = line.strip('\n').split(",")
        tag_dictionary[l[0]] = int(l[1])
    f.close()
    return tag_dictionary

def load_data_from_file_old(f, word_rep, pos_rep, tags, rpMid=False, n_seq=None):
    """Loads into a two lists of arrays, one for words (seq), one for tags (targets), both equal length."""
    print "loading training data"
    #f = open(filepath)
    count_seq = 0
    count_step = 0
    seq = []
    pos_seq = []
    targets = []
    currentUtt = []
    currentPOS = []
    currentTags = []
    for line in f:
        l = line.rstrip("\r\n") # tab-separated fields: sequence number (only on the first line of a sequence), word, POS, ..., tag (last field)
        l = l.split('\t')
        if not l[0] == "" and not currentUtt == []: #new utterance
            count_seq+=1
            x = np.asarray(currentUtt)
            p = np.asarray(currentPOS)
            y = np.asarray(currentTags)
            seq.append(x)
            pos_seq.append(p)
            targets.append(y)
            currentUtt = []
            currentPOS = []
            currentTags = []
        if (not n_seq == None) and count_seq >= n_seq: break
        w = word_rep.get(l[1])
        pos = pos_rep.get(l[2])
        tag = tags.get(str(l[len(l)-1])) # NB POS tags in switchboard at l[2]
        if tag == None:
            if str(l[len(l)-1]) == "<rpMid/>" and rpMid==False:
                tag = tags.get("<f/>")
            elif "rpMid" in str(l[len(l)-1]):
                tag = tags.get("<rm-8/><rpMid/>")
            elif "rpEndSub" in str(l[len(l)-1]):
                tag = tags.get("<rm-8/><rpEndSub/>")
            elif "rpEndDel" in str(l[len(l)-1]):
                tag = tags.get("<rm-8/><rpEndSub/>")
            else:
                s = "No tag in tag dict:" + str(l[len(l)-1])+"%%%"
                raw_input(s)
                
            #print tags
        if w == None:
            logging.info("No word rep for " + l[1])
            #print l[1]
            w = word_rep.get("<unk>")
        if pos == None:
            logging.info("No pos rep for " + l[2])
            #print l[2]
            pos = pos_rep.get("<unk>")
        
        currentUtt.append(w) #one-hot encoding
        currentPOS.append(pos) #one-hot encoding of POS
        currentTags.append(tag) #one-hot encoding of tag
        count_step+=1
        
    #flush
    if not currentUtt == []:
        count_seq+=1
        x = np.asarray(currentUtt)
        p = np.asarray(currentPOS)
        y = np.asarray(currentTags)
        seq.append(x)
        pos_seq.append(p)
        targets.append(y)
    assert len(seq) == len(targets) == len(pos_seq)
    #raw_input()
    print "loaded " + str(len(seq)) + " sequences"
    f.close()
    return (seq,pos_seq,targets)
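
In [ ]:
# Quick sanity check of the old helpers on a toy sequence (a minimal sketch,
# not part of the original pipeline); the printed values should match the
# border cases described in the docstrings above.
toy = [0, 1, 2, 3]
print minibatch_old(toy, 3)            # expect [[0], [0, 1], [0, 1, 2], [1, 2, 3]]
print indicesFromLength_old(4, 3, 0)   # expect [[0, 0], [0, 1], [0, 2], [1, 3]]
print contextwinbackwards_old(toy, 2)  # expect [[-1, 0], [0, 1], [1, 2], [2, 3]]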

In [20]:
import itertools
import numpy
import theano


def load_old():

    theano.config.optimizer = 'None' # speeds things up marginally
    # load the dataset
    train_set, valid_set, test_set, valid_set_alltags, test_set_alltags, dic, train_dict = \
                                                            switchboardfold_old(fold=None, rpMid=False) 
    #adding train_dict as not all tags available in testing
    #will not punish system for getting these wrong.
    
    print str(len(train_dict['labels2idx'].items())) + " training classes"
    print str(len(dic['labels2idx'].items())) + " testing classes"
    print str(len(dic['words2idx'].items())) + " words in vocab"
    if not dic.get('pos2idx') == None:
        print str(len(dic['pos2idx'].items())) + " pos tags in vocab"

    idx2label_train = dict((k,v) for v,k in train_dict['labels2idx'].iteritems()) # first half (28) the same as the test
    idx2label = dict((k,v) for v,k in dic['labels2idx'].iteritems())
    idx2word  = dict((k,v) for v,k in dic['words2idx'].iteritems())
    if not dic.get('pos2idx') == None:
        idx2pos = dict((k,v) for v,k in dic['pos2idx'].iteritems())

    #Now including pos tags
    train_lex, train_pos, train_y = train_set
    valid_lex, valid_pos, valid_y = valid_set
    test_lex,  test_pos, test_y  = test_set
    
    #sets with ALL tags, i.e. those not in training:
    valid_y_alltags = valid_set_alltags[-1] #always the last one
    test_y_alltags = test_set_alltags[-1] #always the last one
    
    vocsize = len(dic['words2idx'].items())
    #nclasses = len(dic['labels2idx'].items()) # actually smaller in reality, i.e. the below
    nclasses = len(train_dict['labels2idx'].items())
    nsentences = len(train_lex)
    possize = None
    if not dic.get('pos2idx') == None:
        possize = len(idx2pos.items())
    
    nwords = len(list(itertools.chain(*train_y))) # TODO have added this
    
    print str(nsentences) + " training sequences"
    print "instantiating model"
    # instantiate the model
    
    # TODO shuffle_old([train_lex,train_pos,train_y], s['seed']) #shuffle training data
    
    s = {'win' : 2, 'bs' : 9}
    # The new code trying to use theano more: converting into matrices with indices
    mycorpus, myb_indices = corpusToIndexedMatrix_old(train_lex, s['win'], s['bs']) # backwards context windows over the whole corpus, plus per-word start/stop index pairs
    mypos = corpusToIndexedMatrix_old(train_pos, s['win'], s['bs'])[0] # first element of the returned tuple is the POS window matrix (indices into one-hot vectors)
    mylabels = list(itertools.chain(*train_y))
    mylabels = numpy.asarray(mylabels, dtype='int32')
    # Now see how they differ...
    return mycorpus, mypos, myb_indices, mylabels

In [21]:
my_c, my_pos, my_indices, my_labels = load_old()


loading training data
loaded 90509 sequences
loading training data
loaded 5717 sequences
loading training data
loaded 5942 sequences
loading training data
loaded 5717 sequences
loading training data
loaded 5942 sequences
27 training classes
50 testing classes
9070 words in vocab
127 pos tags in vocab
90509 training sequences
instantiating model
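
In [ ]:
# Quick shape check on the old-format training data (a sketch for inspection,
# assuming the load_old() call above has run): my_c holds one backwards context
# window per word, my_indices one [start, stop] pair per word, and my_labels
# one tag index per word, so the first dimensions should all agree.
print "corpus window matrix:", my_c.shape
print "POS window matrix:   ", my_pos.shape
print "index pairs:         ", len(my_indices)
print "labels:              ", my_labels.shape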

In [23]:
import sys
sys.path.append("../../../")

In [31]:
from deep_disfluency.tagger.deep_tagger import DeepDisfluencyTagger
import numpy as np
from deep_disfluency.utils.tools import dialogue_data_and_indices_from_matrix
from deep_disfluency.load.load import load_tags

In [26]:
disf = DeepDisfluencyTagger(
    config_file="../../../deep_disfluency/experiments/experiment_configs.csv",
    config_number=21,
    saved_model_dir="../../../deep_disfluency/experiments/021/epoch_40"
    )


Initializing Tagger
Processing args from config file...
Intializing model from args...
Using the cpu
Warning: not using GPU, might be a bit slow
	Adjust Theano config file ($HOME/.theanorc)
loading tag to index maps...
Initializing model of type elman ...
Loading saved weights from ../../../deep_disfluency/experiments/021/epoch_40
No POS tagger specified,loading default CRF switchboard one
No timer specified, using default switchboard one
Loading decoder...
loading swbd_disf1_021 Markov model

In [33]:
train_dialogues_filepath = "../../data/disfluency_detection/feature_matrices/train"
n_extra = 0
utts_presegmented = True
window_size = 2
bs = 9
tags = "disf1_tags"
tag_to_index_map = load_tags("../../data/tag_representations/swbd_disf1_021_tags.csv")
train_matrices = [np.load(train_dialogues_filepath + "/" + fp)
                  for fp in os.listdir(train_dialogues_filepath)]
train_matrices = [dialogue_data_and_indices_from_matrix(
                      d_matrix,
                      n_extra,
                      pre_seg=utts_presegmented,
                      window_size=window_size,
                      bs=bs,
                      tag_rep=tags,
                      tag_to_idx_map=tag_to_index_map,
                      in_utterances=utts_presegmented)
                  for d_matrix in train_matrices]

In [39]:
train_matrices[0][0]


Out[39]:
matrix([[  -1,  708],
        [ 708, 8343],
        [8343, 3346],
        ..., 
        [4363, 3519],
        [3519, 5316],
        [5316,  241]])
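
In [ ]:
# A minimal comparison sketch (assumes all the cells above have been run in
# order): check whether the old-pipeline corpus matrix and the new per-dialogue
# matrices share the same layout (window width, -1 left padding) and a
# comparable word-index range, which is what this notebook set out to verify.
new_c = train_matrices[0][0]
print "old corpus matrix:", my_c.shape, "new (first dialogue):", new_c.shape
print "window widths:", my_c.shape[1], "vs", new_c.shape[1]
print "old first rows:\n", my_c[:3]
print "new first rows:\n", new_c[:3]
print "old word index range:", my_c.min(), "to", my_c.max()
print "new word index range:", new_c.min(), "to", new_c.max()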

In [ ]: