In [69]:
from itertools import chain
import nltk
import pycrfsuite as crf
import os, sys, glob, csv
from collections import namedtuple
import pandas as pd
import numpy as np
from collections import defaultdict as ddict
# Audible notification for long-running cells: beep x times
beep = lambda x: os.system("echo -n '\a';sleep 0.5;" * x)
beep(2)
In [71]:
# Code given by the TAs
def get_utterances_from_file(dialog_csv_file):
    """Returns a list of DialogUtterances from an open file."""
    reader = csv.DictReader(dialog_csv_file)
    return [_dict_to_dialog_utterance(du_dict) for du_dict in reader]

def get_utterances_from_filename(dialog_csv_filename):
    """Returns a list of DialogUtterances from an unopened filename."""
    with open(dialog_csv_filename, "r") as dialog_csv_file:
        return get_utterances_from_file(dialog_csv_file)

def get_data(data_dir):
    """Generates a list of utterances for each dialog file.

    To get a list of all dialogs, call list(get_data(data_dir)).
    data_dir - a directory with CSV files containing dialogs."""
    dialog_filenames = sorted(glob.glob(os.path.join(data_dir, "*.csv")))
    for dialog_filename in dialog_filenames:
        yield dialog_filename, get_utterances_from_filename(dialog_filename)

DialogUtterance = namedtuple("DialogUtterance", ("act_tag", "speaker", "pos", "text"))
PosTag = namedtuple("PosTag", ("token", "pos"))

def _dict_to_dialog_utterance(du_dict):
    """Private method for converting a dict to a DialogUtterance."""
    # Replace empty or whitespace-only fields with None
    for k, v in du_dict.items():
        if len(v.strip()) == 0:
            du_dict[k] = None

    # Extract tokens and POS tags
    if du_dict["pos"]:
        du_dict["pos"] = [
            PosTag(*token_pos_pair.split("/"))
            for token_pos_pair in du_dict["pos"].split()]
    return DialogUtterance(**du_dict)
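A quick sanity check of the loader (a minimal sketch; the ../data/train path is the one used for training below, so adjust it to your layout):
In [ ]:
# Peek at the first parsed dialogue (the path is an assumption)
fname, dialog = next(get_data("../data/train"))
print(fname, "->", len(dialog), "utterances")
first = dialog[0]
print(first.act_tag, first.speaker)
# Tagged utterances carry PosTag tuples; untagged ones only raw text
print(first.pos[:3] if first.pos else first.text)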
In [73]:
class CrfClassifier(object):
    def __init__(self, max_iters=50, l1_reg=1.0, l2_reg=1e-3):
        self.trainer = crf.Trainer(verbose=False)
        self.trainer.set_params({
            'c1': l1_reg,                # coefficient for L1 penalty
            'c2': l2_reg,                # coefficient for L2 penalty
            'max_iterations': max_iters, # stop earlier
            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        })

    def featurize(self, idx, dialog):
        feats = []
        if idx == 0:
            feats.append("BOD")  # beginning of dialogue
        elif dialog[idx].speaker != dialog[idx - 1].speaker:
            feats.append("SPKRCNG")  # speaker change
        utterance = dialog[idx]
        if utterance.pos is not None:
            for i, pos in enumerate(utterance.pos):
                feats.append("TOKEN_%s" % pos.token)
                feats.append("POS[%d]=%s" % (i + 1, pos.pos))
        else:
            # No POS tags: fall back to the raw text (e.g. "<Laughter>." actions)
            tokens = utterance.text.replace('>', '').replace('<', '').replace('.', '').split()
            for i, tok in enumerate(tokens):
                feats.append("ACTN[%d]=%s" % (i + 1, tok))
        return feats

    def transform(self, dialog):
        X, Y = [], []
        for idx, utterance in enumerate(dialog):
            Y.append(utterance.act_tag)
            X.append(self.featurize(idx, dialog))
        return X, Y

    def train(self, train_dir, model_path):
        '''Terminology:
        a directory has dialogues (one per file),
        dialogues have utterances (one per line),
        utterances have a label, a speaker, tokens and the raw text.
        '''
        dialogs = get_data(train_dir)
        for f_name, dialog in dialogs:
            X, Y = self.transform(dialog)
            self.trainer.append(X, Y)
        print("Training and saving model to %s" % model_path)
        self.trainer.train(model_path)
        print("Done")

    def load_model(self, model_file):
        self.tagger = crf.Tagger()
        self.tagger.open(model_file)

    def test(self, test_dir):
        dialogs = get_data(test_dir)
        for f_name, dialog in dialogs:
            X, Y = self.transform(dialog)
            predY = self.tagger.tag(X)
            assert len(Y) == len(predY)
            for i in range(len(Y)):
                yield (Y[i], predY[i])

    def evaluate(self, dev_dir):
        print("Evaluating %s" % dev_dir)
        recs = self.test(dev_dir)
        matrix = ddict(lambda: ddict(int))  # confusion matrix: truth -> prediction -> count
        for actual, predicted in recs:
            matrix[actual][predicted] += 1
        trace = sum(matrix[cls][cls] for cls in matrix)       # correct predictions
        tot = sum(sum(d.values()) for d in matrix.values())   # all predictions
        return trace / tot, matrix

    def predict(self, data_dir, out_file):
        dialogs = get_data(data_dir)
        with open(out_file, 'w') as out:
            for f_name, dialog in dialogs:
                out.write('Filename="%s"\n' % f_name.split("/")[-1])
                X, _ = self.transform(dialog)
                predY = self.tagger.tag(X)
                assert len(predY) == len(X)
                out.write("\n".join(predY))
                out.write("\n\n")
        print("Output stored at %s" % out_file)

    def train_predict(self, train_dir, data_dir, out_file, model_file="crf_model.data"):
        self.train(train_dir, model_file)
        self.load_model(model_file)
        self.predict(data_dir, out_file)

train_dir = "../data/train"
dev_dir = "../data/dev"
output_file = "output1.txt"

CrfClassifier().train_predict(train_dir, dev_dir, output_file)
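To sanity-check what the model learned, pycrfsuite's Tagger.info() exposes the learned weights; a minimal sketch listing the strongest label-to-label transitions, assuming the default model file written by the cell above:
In [ ]:
from collections import Counter
tagger = crf.Tagger()
tagger.open("crf_model.data")  # model file written by train_predict above
info = tagger.info()
# Strongest act-tag -> act-tag transition weights learned by the CRF
for (prev, cur), w in Counter(info.transitions).most_common(10):
    print("%-8s -> %-8s %6.3f" % (prev, cur, w))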
In [62]:
!head output1.txt
In [59]:
%%time
class AdvancedCRF(CrfClassifier):
    def featurize(self, idx, dialog):
        feats = super(AdvancedCRF, self).featurize(idx, dialog)
        # More features on top of the baseline set
        # feats.append("_bias_")
        if idx == len(dialog) - 1:
            feats.append("EOD")      # end of dialogue
        if idx == len(dialog) - 2:
            feats.append("EOD[-1]")  # penultimate utterance
        utter = dialog[idx]
        if utter.pos is not None:
            # Position-indexed token and POS unigrams
            for i in range(len(utter.pos)):
                feats.append("TOKEN[%d]=%s" % (i, utter.pos[i].token))
                feats.append("POS[%d]=%s" % (i, utter.pos[i].pos))
            # Token and POS bigrams
            for i in range(len(utter.pos) - 1):
                feats.append("TOKEN_%s|TOKEN_%s" % (utter.pos[i].token, utter.pos[i + 1].token))
                feats.append("POS_%s|POS_%s" % (utter.pos[i].pos, utter.pos[i + 1].pos))
        # Features copied from the neighbouring utterances were tried but left disabled:
        # if idx > 0:
        #     prev_feats = super(AdvancedCRF, self).featurize(idx - 1, dialog)
        #     prev_feats = filter(lambda x: 'POS' in x, prev_feats)
        #     feats.extend("PREV_%s" % f for f in prev_feats)
        # if idx < len(dialog) - 1:
        #     next_feats = super(AdvancedCRF, self).featurize(idx + 1, dialog)
        #     feats.extend("NEXT_%s" % f for f in next_feats)
        return feats

AdvancedCRF(max_iters=50).train_predict(train_dir, dev_dir, output_file)
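A rough way to compare the two feature sets on the dev set (a sketch; the model and output filenames here are placeholders, passed explicitly so the two runs do not overwrite each other):
In [ ]:
# Train both models and score them with evaluate() (filenames are hypothetical)
base = CrfClassifier()
base.train_predict(train_dir, dev_dir, "output_base.txt", model_file="crf_base.data")
adv = AdvancedCRF()
adv.train_predict(train_dir, dev_dir, "output_adv.txt", model_file="crf_adv.data")
print("baseline dev accuracy: %.4f" % base.evaluate(dev_dir)[0])
print("advanced dev accuracy: %.4f" % adv.evaluate(dev_dir)[0])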
In [68]:
def evaluate_output(dev_dir, out_file):
    '''Scores the predictions in out_file against the gold labels in dev_dir.'''
    # Index the gold act tags by file name
    index = {}
    for fn, dialog in get_data(dev_dir):
        index[fn.split('/')[-1]] = [utter.act_tag for utter in dialog]

    matrix = ddict(lambda: ddict(int))  # confusion matrix: truth -> prediction -> count
    with open(out_file) as inp:
        c = 0
        fn = None
        for line in map(lambda x: x.strip(), inp):
            if line.startswith('Filename='):
                fn = line.split('=')[1].replace('"', '')
                continue
            if not line:  # blank line marks the end of a dialogue
                c = 0
                continue
            pred = line
            truth = index[fn][c]
            matrix[truth][pred] += 1
            c += 1
    tot = sum(map(lambda x: sum(x.values()), matrix.values()))
    trace = sum(map(lambda x: matrix[x][x], matrix.keys()))
    return trace / tot, matrix

perf, dod = evaluate_output(dev_dir, output_file)
print(perf)
pd.DataFrame(dod)
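The confusion matrix also supports per-class numbers; a small sketch computing recall for each true tag from dod (which maps true tag to predicted tag to count):
In [ ]:
# Per-tag recall: fraction of each true tag that was predicted correctly
recall = {tag: dod[tag].get(tag, 0) / sum(dod[tag].values()) for tag in dod}
pd.Series(recall).sort_values(ascending=False).head(10)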