In [221]:
import pickle

testEntities_crf = 'consolidated2/consolidated/dev+test+context.new.crf.p'
trainEntities_crf = 'consolidated2/consolidated/train+context.new.crf.p'
testEntities_ent = 'consolidated/dev+test+context.5.p'
trainEntities_ent = 'consolidated/train+context.5.p'
# load cached entity extractions (precomputed to speed things up)
train_articles_crf, train_titles_crf, train_identifiers_crf, train_downloaded_articles_crf, \
    TRAIN_ENTITIES_CRF, TRAIN_CONFIDENCES_CRF, TRAIN_COSINE_SIM_CRF, CONTEXT1_crf, CONTEXT2_crf = pickle.load(open(trainEntities_crf, "rb"))
train_articles, train_titles, train_identifiers, train_downloaded_articles, \
    TRAIN_ENTITIES, TRAIN_CONFIDENCES, TRAIN_COSINE_SIM, CONTEXT1, CONTEXT2 = pickle.load(open(trainEntities_ent, "rb"))
# NOTE: the test loads below rebind CONTEXT2_crf and CONTEXT2, shadowing the
# train-side values unpacked above; trainClassifier uses CONTEXT1 (train-side)
# while predictEntities uses the test-side CONTEXT2.
test_articles_crf, test_titles_crf, test_identifiers_crf, test_downloaded_articles_crf, \
    TEST_ENTITIES_CRF, TEST_CONFIDENCES_CRF, TEST_COSINE_SIM_CRF, CONTEXT_crf, CONTEXT2_crf = pickle.load(open(testEntities_crf, "rb"))
test_articles, test_titles, test_identifiers, test_downloaded_articles, \
    TEST_ENTITIES, TEST_CONFIDENCES, TEST_COSINE_SIM, CONTEXT, CONTEXT2 = pickle.load(open(testEntities_ent, "rb"))
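
# Assumed nesting of the cached structures (inferred from the indexing in the
# Classifier code below, not from the pickle files themselves):
#   ENTITIES[article][query][supporting_article] -> list of 4 extracted strings
#       (shooterName, killedNum, woundedNum, city); index 0 is the original article
#   CONFIDENCES mirrors ENTITIES, holding per-slot extractor confidences
#   COSINE_SIM[article][query][k] -> tf-idf similarity of supporting article k+1
#       to the original (shifted by one; see getFeatures)
#   CONTEXT*[article][query][supporting_article] -> list of context feature vectors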
In [532]:
from sklearn.linear_model import LogisticRegression as MaxEnt
import collections
import copy
import json
import operator
import pdb
import pickle
import random
import sys
import warnings

import predict as p

warnings.filterwarnings("ignore")
# Module-level factories for nested defaultdicts, so the structures stay
# picklable (pickle cannot serialize lambdas).
def dd():
    return {}

def ddd():
    return collections.defaultdict(dd)
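
# Hypothetical quick check of the picklability rationale: a ddd() round-trips,
# whereas collections.defaultdict(lambda: {}) would raise on pickling.
_d = ddd()
_d['a']['b'] = 1
assert pickle.loads(pickle.dumps(_d))['a']['b'] == 1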
class Classifier(object):
def __init__(self, TRAIN_ENTITIES, TRAIN_CONFIDENCES, TRAIN_COSINE_SIM,\
TEST_ENTITIES, TEST_CONFIDENCES, TEST_COSINE_SIM):
self.TRAIN_ENTITIES = TRAIN_ENTITIES
self.TRAIN_CONFIDENCES = TRAIN_CONFIDENCES
self.TRAIN_COSINE_SIM = TRAIN_COSINE_SIM
self.TEST_ENTITIES = TEST_ENTITIES
self.TEST_CONFIDENCES = TEST_CONFIDENCES
self.TEST_COSINE_SIM = TEST_COSINE_SIM
self.match_orig_feature = True
self.print_query_scores = False
    ## current_ent_locations is an array of len 4 [query, supp_article] to signify which article ...
def trainClassifier(self, train_identifiers):
classifier = MaxEnt(solver="lbfgs", verbose=1)
X = []
Y = []
num_neg = 0
max_neg = 1000
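        # getLabels yields label 5 when a supporting article improves no entity
        # slot; cap that negative class at max_neg examples to limit class imbalance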
for article_index in range(len(self.TRAIN_ENTITIES)):
article = self.TRAIN_ENTITIES[article_index]
for query_index in range(len(article)):
query = article[query_index]
for supporting_article_index in range(1,len(query)):
features = self.getFeatures(article_index, query_index, supporting_article_index, \
self.TRAIN_ENTITIES, self.TRAIN_CONFIDENCES,self.TRAIN_COSINE_SIM, CONTEXT1)
labels = self.getLabels(article_index, query_index, supporting_article_index, \
self.TRAIN_ENTITIES, train_identifiers)
for label in labels:
if label == 5:
if num_neg < max_neg:
num_neg+=1
X.append(features)
Y.append(label)
else:
X.append(features)
Y.append(label)
        assert len(X) == len(Y)
        print "Class dist", [sum([y == i for y in Y]) for i in range(6)]
        print "Total labels", len(Y)
        classifier.fit(X, Y)
return classifier
def predictEntities(self, classifier):
# print "Classifier coef", classifier.coef_
# print "Classifeir intercept", classifier.intercept_
predictions = [0,0,0,0,0,0]
DECISIONS = copy.deepcopy(self.TEST_ENTITIES)
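        # DECISIONS mirrors TEST_ENTITIES' nesting; each leaf holds 4 binary
        # flags, one per entity slot, marking whether this supporting article's
        # value is accepted for that slot. The original article (index 0) is
        # always accepted, and predicted class 4 accepts all four slots.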
i = 0
for article_index in range(len(self.TEST_ENTITIES)):
article = self.TEST_ENTITIES[article_index]
for query_index in range(len(article)):
query = article[query_index]
for supporting_article_index in range(len(query)):
if supporting_article_index == 0:
DECISIONS[article_index][query_index]\
[supporting_article_index] = [1, 1, 1, 1]
continue
DECISIONS[article_index][query_index]\
[supporting_article_index] = [0, 0, 0, 0]
                    features = self.getFeatures(article_index, query_index, supporting_article_index,
                                                self.TEST_ENTITIES, self.TEST_CONFIDENCES,
                                                self.TEST_COSINE_SIM, CONTEXT2)
                    # assert len(features) == 41
                    prediction = classifier.predict([features])[0]
predictions[prediction] += 1
if prediction < 4:
DECISIONS[article_index][query_index]\
[supporting_article_index][prediction] = 1
elif prediction == 4:
DECISIONS[article_index][query_index]\
[supporting_article_index] = [1, 1, 1, 1]
return DECISIONS
#Run both Max Confidence and Majority Aggregation Schemes given the decisions
#Return the decided tag for each query
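    # Worked example (hypothetical numbers) for one entity slot with accepted
    # candidates ('smith', conf 0.9), ('jones', conf 0.4), ('jones', conf 0.3):
    #   majority        -> 'jones' (two votes to one)
    #   max confidence  -> 'smith' (highest single confidence, 0.9)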
def aggregateResults(self, DECISIONS):
majority = []
max_conf = []
for article_index in range(len(self.TEST_ENTITIES)):
max_conf.append([])
majority.append([])
article = self.TEST_ENTITIES[article_index]
for query_index in range(len(article)):
max_conf[article_index].append([])
majority[article_index].append([])
query = article[query_index]
for entity_index in range(4):
max_confidence = -1
max_confidence_tag = ''
                    tag_occurrences = {}
for supporting_article_index in range(len(query)):
supporting_article = query[supporting_article_index]
if DECISIONS[article_index][query_index][supporting_article_index]\
[entity_index] == 0:
continue
confidence = self.TEST_CONFIDENCES[article_index][query_index]\
[supporting_article_index][entity_index]
entity = supporting_article[entity_index].strip().lower()
# assert(not entity == '')
                        ## update majority-vote counts
                        if entity not in tag_occurrences:
                            tag_occurrences[entity] = 1
                        else:
                            tag_occurrences[entity] += 1
##Update max_confidence
if confidence > max_confidence:
max_confidence = confidence
max_confidence_tag = entity
max_majority_count = -1
majority_tag = ''
                    for ent in tag_occurrences:
                        if tag_occurrences[ent] > max_majority_count:
                            max_majority_count = tag_occurrences[ent]
majority_tag = ent
max_conf[article_index][query_index].append(max_confidence_tag)
majority[article_index][query_index].append(majority_tag)
return majority, max_conf
def evaluateBaseline(self, predicted_identifiers, test_identifiers, COUNT_ZERO):
for entity_index in range(4):
            num_queries = 5  # assumed max queries per article (cf. the ".5" in the cached file names)
predicted_correct = [0.] * num_queries
total_predicted = [0.] * num_queries
total_gold = [0.] * num_queries
for article_index in range(len(predicted_identifiers)):
## TODO: Add classifier for selecting query index?
for query_index in range(len(predicted_identifiers[article_index])):
predicted = predicted_identifiers[article_index][query_index][entity_index].strip().lower()
gold = test_identifiers[article_index][entity_index].strip().lower()
if gold == '' or (not COUNT_ZERO and gold == 'zero'):
continue
#special handling for shooterName (lenient eval)
if entity_index == 0:
predicted = set(predicted.split('|'))
gold = set(gold.split('|'))
correct = gold.intersection(predicted)
predicted_correct[query_index] += (1 if len(correct)>0 else 0)
total_predicted[query_index] += 1
total_gold[query_index] += 1
else:
total_predicted[query_index] += 1
if predicted == gold:
predicted_correct[query_index] += 1
total_gold[query_index] += 1
print "Entity", entity_index, ":",
if sum(total_predicted) == 0 :
continue
if sum(predicted_correct) == 0 :
continue
if self.print_query_scores:
print "BEGINNING WITH PER QUERY SCORES"
for query_index in range(num_queries):
print "*********************************************"
print
print "QUERY INDEX:", query_index
self.displayScore(predicted_correct[query_index], total_predicted[query_index],\
total_gold[query_index])
print
print "*********************************************"
print "NOW SHOWING SCORES AGGREGATED OVER ALL QUERRIES"
self.displayScore(sum(predicted_correct), sum(total_predicted),sum(total_gold))
    def displayScore(self, predicted_correct, total_predicted, total_gold):
        precision = predicted_correct / total_predicted
        recall = predicted_correct / total_gold
        # F1 is the harmonic mean of precision and recall
        f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0.
        print
        print "PRECISION", precision, "RECALL", recall, "F1", f1
        print "Total match", predicted_correct
def trainAndEval(self, train_identifiers, test_identifiers, COUNT_ZERO):
classifier = self.trainClassifier(train_identifiers)
DECISIONS = self.predictEntities(classifier)
debug = True
if debug:
self.runExploratoryTests(DECISIONS, train_identifiers, test_identifiers)
majority, max_conf = self.aggregateResults(DECISIONS)
print "#############################################################"
print "Evaluation for Classifier baseline with MAJORITY aggregation"
print
self.evaluateBaseline(majority, test_identifiers, COUNT_ZERO)
print
print "#############################################################"
print "Evaluation for Classifier baseline with MAX CONFIDENCE aggregation"
print
self.evaluateBaseline(max_conf, test_identifiers, COUNT_ZERO)
print
print "#############################################################"
def getFeatures(self, article_index, query_index, supporting_article_index, entities, confidences, cosine_sim, context):
        features = []
        # construct the feature vector for this (article, query, supporting article)
        original_confidence = confidences[article_index][query_index][0]
        confidence = confidences[article_index][query_index][supporting_article_index]
        # one-hot flags showing whether each entity matches the original extraction
original_entity = entities[article_index][query_index][0]
new_entity = entities[article_index][query_index][supporting_article_index]
match_features = []
for e_index in range(len(original_entity)):
if original_entity[e_index] == '':
match_features += [0, 1]
elif original_entity[e_index].strip().lower() == new_entity[e_index].strip().lower():
match_features += [1, 0]
else:
match_features += [0, 1]
# Cosine sim array is shifted by one.
# Index 0 should be 1 as orig is same as itself.
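        # e.g. with one original + three supporting articles, cosine_sim[a][q]
        # has length 3 and cosine_sim[a][q][0] is supporting article 1's
        # similarity to the original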
tfidf = 1 if supporting_article_index == 0 else \
cosine_sim[article_index]\
[query_index][supporting_article_index - 1]
        features = original_confidence + confidence + match_features + [tfidf]
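        # Layout (assuming 4 entity slots): 4 original confidences + 4 new
        # confidences + 8 match flags + 1 tf-idf score = 17 values, plus the
        # flattened context features appended below (the commented assert of 41
        # would imply 24 context dimensions)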
for c in context[article_index][query_index][supporting_article_index]:
features += c
# assert len(features) == 41
return features
def getLabels(self, article_index, query_index, supporting_article_index, entities, identifier):
        # Extract the label(s) for this supporting article (i.e. which entity slots it gets right)
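        # Label semantics: i in 0..3 -> entity i matches gold and differs from
        # the original extraction; 4 -> all four slots qualify; 5 -> none do
        # (the negative class)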
labels = []
gold_entities = identifier[article_index]
new_entities = entities[article_index][query_index][supporting_article_index]
orig_entities = entities[article_index][query_index][0]
for ind in range(len(gold_entities)):
ent = new_entities[ind].lower().strip()
orig_ent = orig_entities[ind].lower().strip()
gold = gold_entities[ind].lower().strip()
if gold == "":
continue
if ent == "":
continue
#special handling for shooterName (entity_index = 0)
if ind == 0:
new_person = set(ent.split('|'))
gold_person = set(gold.split('|'))
if len(new_person.intersection(gold_person)) > 0:
if not ent == orig_ent:
labels.append(ind)
else:
if gold == ent:
if not ent == orig_ent:
labels.append(ind)
if labels == [0, 1, 2, 3]:
labels = [4]
elif labels == []:
labels = [5]
assert (len(labels) > 0)
return labels
def runExploratoryTests(self, DECISIONS, train_identifiers, test_identifiers):
print "TRAIN:: Exploring how many times gold entity is not in original document"
count = collections.defaultdict(lambda:0.)
total_count = collections.defaultdict(lambda:0.)
for article_index in range(len(self.TRAIN_ENTITIES)):
article = self.TRAIN_ENTITIES[article_index]
for entity_index in range(4):
for query_index in range(len(article)):
query = article[query_index]
for supp_index in range(len(query)):
orig_entity = query[0][entity_index].strip().lower()
entity = query[supp_index][entity_index].strip().lower()
gold_ent = train_identifiers[article_index][entity_index].strip().lower()
if gold_ent == "" or gold_ent == 'zero':
continue
if entity == orig_entity:
continue
if entity_index > 0: #not shooter
if entity == gold_ent:
count[entity_index] += 1
else:
orig_entity = set(orig_entity.split('|'))
gold = set(gold_ent.split('|'))
entity = set(entity.split('|'))
if len(entity.intersection(gold)) > len(orig_entity.intersection(gold)):
count[entity_index] += 1
total_count[entity_index] +=1
# print "COUNT ", count
# print "TOTAL ", total_count
print "ENTS: counts gold not in orig" , [count[i] for i in range(4)]
print "Ratio" , [count[i]/total_count[i] for i in range(4)]
print "TEST: Exploring if classifier ever chooses not first entity"
ones_not_orig = [0] * 4
ones = [0] * 4
counts = [0] * 4
for entity_index in range(4):
for article_index in range(len(self.TEST_ENTITIES)):
article = self.TEST_ENTITIES[article_index]
for query_index in range(len(article)):
query = article[query_index]
orig_entity = query[0][entity_index].strip().lower()
gold = test_identifiers[article_index][entity_index].strip().lower()
for supp_index in range(len(query)):
decision = DECISIONS[article_index][query_index][supp_index][entity_index]
counts[entity_index] += 1
if decision == 1:
entity = query[supp_index][entity_index].strip().lower()
ones[entity_index] += 1
if not entity == orig_entity and not entity == "":
ones_not_orig[entity_index] += 1
print "DECS: counts Chosen not in orig" , ones_not_orig
print "DECS: counts ONES " , ones
print "Ratio one not matching original entity in prediction", [ ones_not_orig[x]*1. / counts[x] for x in range(4)]
In [533]:
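# Scratch check: precision/recall/F1 of the raw CRF extractions, scoring only
# each query's original article (supporting index 0) against gold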
verbose = False
count_name = 0
print "size", len(TEST_CONFIDENCES_CRF)
for entity_ind in range(4):
entity_name = p.int2tags[entity_ind+1]
correct = 0
gold_num = 0
total = 0
for article_ind in range(len(TEST_ENTITIES_CRF)):
article = TEST_ENTITIES_CRF[article_ind]
gold_ent = test_identifiers[article_ind][entity_ind].strip().lower()
for query_ind in range(len(article)):
query = article[query_ind]
            # for sup_ind in range(len(query)):
            sup = query[0]  # only the original article's extraction is scored
ent = sup[entity_ind].strip().lower()
            if gold_ent == '' or gold_ent == 'zero':
continue
            if entity_ind == 0:
                # shooterName: lenient set overlap on the '|'-separated names
                gold_ent_set = set(gold_ent.split('|'))
                ent_set = set(ent.split('|'))
                correct_int = gold_ent_set.intersection(ent_set)
                correct += (1 if len(correct_int) > 0 else 0)
                if ent != '':
                    count_name += 1
                gold_num += 1
                total += 1
else:
if ent == gold_ent:
correct += 1
gold_num += 1
total += 1
prec = correct*1./total
recall = correct*1./gold_num
f1 = 0
if not prec + recall == 0:
f1 = 2*(prec*recall)/(prec+recall)
# print entity_name
# print "prec", prec, "recall", recall, "f1:" , f1
# baseline = Classifier(TRAIN_ENTITIES_CRF, TRAIN_CONFIDENCES_CRF, TRAIN_COSINE_SIM_CRF,\
# TEST_ENTITIES_CRF, TEST_CONFIDENCES_CRF, TEST_COSINE_SIM_CRF)
baseline = Classifier(TRAIN_ENTITIES, TRAIN_CONFIDENCES, TRAIN_COSINE_SIM, \
TEST_ENTITIES, TEST_CONFIDENCES, TEST_COSINE_SIM)
baseline.trainAndEval(train_identifiers, test_identifiers, False)
In [459]:
import sklearn
print sklearn.__version__
OLD (F1)
shooterName 0.452380952381 (45.2)
killedNum   0.697530864198 (69.8)
woundedNum  0.686346863469 (68.6)
city        0.537671232877 (53.8)

NEW (F1, diff vs OLD)
shooterName 0.452380952381 (45.2)   +0.0
killedNum   0.707407407407 (70.7)   +0.9
woundedNum  0.684132841328 (68.4)   -0.2
city        0.552739726027 (55.3)   +1.5

net +2.2