In [221]:
testEntities_crf='consolidated2/consolidated/dev+test+context.new.crf.p'
trainEntities_crf='consolidated2/consolidated/train+context.new.crf.p'

testEntities_ent='consolidated/dev+test+context.5.p'
trainEntities_ent='consolidated/train+context.5.p'

train_articles_crf, train_titles_crf, train_identifiers_crf, train_downloaded_articles_crf, \
TRAIN_ENTITIES_CRF, TRAIN_CONFIDENCES_CRF, TRAIN_COSINE_SIM_CRF, CONTEXT1_crf, CONTEXT2_crf = pickle.load(open(trainEntities_crf, "rb"))

# Load cached entities (speeds up repeated runs)
train_articles, train_titles, train_identifiers, train_downloaded_articles, \
TRAIN_ENTITIES, TRAIN_CONFIDENCES, TRAIN_COSINE_SIM, CONTEXT1, CONTEXT2 = pickle.load(open(trainEntities_ent, "rb"))


# NOTE: the second context variable in each load below re-binds the name used
# for the train-set context (CONTEXT2_crf / CONTEXT2) loaded above.
test_articles_crf, test_titles_crf, test_identifiers_crf, test_downloaded_articles_crf,\
TEST_ENTITIES_CRF, TEST_CONFIDENCES_CRF, TEST_COSINE_SIM_CRF, CONTEXT_crf, CONTEXT2_crf = pickle.load(open(testEntities_crf, "rb"))

test_articles, test_titles, test_identifiers, test_downloaded_articles,\
TEST_ENTITIES, TEST_CONFIDENCES, TEST_COSINE_SIM, CONTEXT, CONTEXT2 = pickle.load(open(testEntities_ent, "rb"))

In [532]:
from sklearn.linear_model import LogisticRegression as MaxEnt
import copy
import random
import collections
import sys, json, pdb, pickle, operator
import predict as p
import warnings

warnings.filterwarnings("ignore")



def dd():
    return {}

def ddd():
    return collections.defaultdict(dd)
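
# These module-level factories exist so that nested defaultdicts can be
# pickled, since pickle cannot serialize lambda factories (an assumption
# about their purpose, based on how the cached files above were built).
# Quick check:
_d = ddd()                    # missing keys map to fresh dicts
_d['a']['b'] = 1
assert pickle.loads(pickle.dumps(_d))['a']['b'] == 1  # a lambda factory would raise here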

class Classifier(object):

    def __init__(self, TRAIN_ENTITIES, TRAIN_CONFIDENCES, TRAIN_COSINE_SIM,\
                 TEST_ENTITIES, TEST_CONFIDENCES, TEST_COSINE_SIM):
        self.TRAIN_ENTITIES = TRAIN_ENTITIES
        self.TRAIN_CONFIDENCES = TRAIN_CONFIDENCES
        self.TRAIN_COSINE_SIM = TRAIN_COSINE_SIM

        self.TEST_ENTITIES = TEST_ENTITIES
        self.TEST_CONFIDENCES = TEST_CONFIDENCES
        self.TEST_COSINE_SIM = TEST_COSINE_SIM

        self.match_orig_feature = True
        self.print_query_scores = False


    ## Train a MaxEnt classifier over (query, supporting article) pairs; the labels
    ## index which entity slots the supporting article improves (see getLabels).
    def trainClassifier(self, train_identifiers):

        classifier = MaxEnt(solver="lbfgs", verbose=1)
        X = []
        Y = []
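        # Class 5 ("no slot improves") dominates the raw pairs, so its count
        # is capped at max_neg below; the run below prints the resulting class
        # distribution [124, 170, 193, 260, 0, 1000] over classes 0-5.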
        num_neg = 0
        max_neg = 1000
        for article_index in range(len(self.TRAIN_ENTITIES)):
            article = self.TRAIN_ENTITIES[article_index]
            for query_index in range(len(article)):
                query = article[query_index]
                for supporting_article_index in range(1,len(query)):
                    # CONTEXT1 is the train-set context loaded above (a module-level global).
                    features = self.getFeatures(article_index, query_index, supporting_article_index, \
                                                self.TRAIN_ENTITIES, self.TRAIN_CONFIDENCES, self.TRAIN_COSINE_SIM, CONTEXT1)
                    
                    labels = self.getLabels(article_index, query_index, supporting_article_index, \
                                                self.TRAIN_ENTITIES, train_identifiers)
                        
                    for label in labels:
                        if label == 5: 
                            if num_neg < max_neg:
                                num_neg+=1 
                                X.append(features)
                                Y.append(label)
                        else:
                            X.append(features)
                            Y.append(label)
                assert len(X) == len(Y)
                
        print "Class dist", [sum([y == i for y in Y])for i in range(6)]
        print "Total labels", len(Y)
        classifier.fit(X,Y)
            
        return classifier

    def predictEntities(self, classifier):
        
#         print "Classifier coef", classifier.coef_
#         print "Classifeir intercept", classifier.intercept_
        predictions = [0,0,0,0,0,0]
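        # Class scheme (mirrors getLabels below):
        #   0-3 -> the supporting article supplies the gold value for that
        #          entity slot where the original extraction did not
        #   4   -> all four slots improve -> decision vector [1, 1, 1, 1]
        #   5   -> no slot improves       -> decision vector [0, 0, 0, 0]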
        DECISIONS = copy.deepcopy(self.TEST_ENTITIES)
        i = 0
        for article_index in range(len(self.TEST_ENTITIES)):
            article = self.TEST_ENTITIES[article_index]
            for query_index in range(len(article)):
                query = article[query_index]
                for supporting_article_index in range(len(query)):
                    if supporting_article_index == 0:
                        DECISIONS[article_index][query_index]\
                            [supporting_article_index] = [1, 1, 1, 1]
                        continue
                    DECISIONS[article_index][query_index]\
                            [supporting_article_index] = [0, 0, 0, 0] 

                    # CONTEXT2 here is the test-set context (re-bound by the test load cell above).
                    features = self.getFeatures(article_index, query_index, supporting_article_index, self.TEST_ENTITIES, self.TEST_CONFIDENCES,\
                               self.TEST_COSINE_SIM, CONTEXT2)

#                     assert len(features) == 41
                    prediction = classifier.predict([features])[0]  # predict expects a 2-D array
                    predictions[prediction] += 1
                    if prediction < 4:
                        DECISIONS[article_index][query_index]\
                            [supporting_article_index][prediction] = 1
                    elif prediction == 4:
                        DECISIONS[article_index][query_index]\
                            [supporting_article_index] = [1, 1, 1, 1]
                

        return DECISIONS

    #Run both Max Confidence and Majority Aggregation Schemes given the decisions
    #Return the decided tag for each query
    def aggregateResults(self, DECISIONS):
        majority = []
        max_conf = []
        for article_index in range(len(self.TEST_ENTITIES)):
            max_conf.append([])
            majority.append([])
            article = self.TEST_ENTITIES[article_index]
            for query_index in range(len(article)):
                max_conf[article_index].append([])
                majority[article_index].append([])
                query = article[query_index]
                for entity_index in range(4):
                    max_confidence = -1
                    max_confidence_tag = ''
                    tag_occurances = {}
                    for supporting_article_index in range(len(query)):
                        supporting_article = query[supporting_article_index]
                        if DECISIONS[article_index][query_index][supporting_article_index]\
                           [entity_index] == 0:
                            continue


                        confidence = self.TEST_CONFIDENCES[article_index][query_index]\
                                [supporting_article_index][entity_index]
                        entity = supporting_article[entity_index].strip().lower()
#                         assert(not entity == '')

                        ##Update counts of majority
                        if entity not in tag_occurances:
                            tag_occurances[entity] = 1
                        else:
                            tag_occurances[entity] += 1

                        ##Update max_confidence
                        if confidence > max_confidence:
                            max_confidence = confidence
                            max_confidence_tag = entity
                    max_majority_count = -1
                    majority_tag = ''
                    for ent in tag_occurances:
                        if tag_occurances[ent] > max_majority_count:
                            max_majority_count = tag_occurances[ent]
                            majority_tag = ent
                    max_conf[article_index][query_index].append(max_confidence_tag)
                    majority[article_index][query_index].append(majority_tag)

        return majority, max_conf
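
    # Toy illustration (hypothetical numbers): if the surviving decisions for
    # one entity slot carry (entity, confidence) pairs ('smith', 0.9),
    # ('jones', 0.6), ('jones', 0.5), then MAJORITY picks 'jones' (two votes
    # to one) while MAX CONFIDENCE picks 'smith' (0.9).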



    def evaluateBaseline(self, predicted_identifiers, test_identifiers, COUNT_ZERO):
        for entity_index in range(4):
            num_queries = 5
            predicted_correct = [0.] * num_queries
            total_predicted   = [0.] * num_queries
            total_gold        = [0.] * num_queries

            for article_index in range(len(predicted_identifiers)):
                ## TODO: Add classifier for selecting query index?
                for query_index in range(len(predicted_identifiers[article_index])):        
                    predicted = predicted_identifiers[article_index][query_index][entity_index].strip().lower()
                    gold = test_identifiers[article_index][entity_index].strip().lower()
                    if gold == '' or (not COUNT_ZERO and gold == 'zero'):
                        continue


                    #special handling for shooterName (lenient eval)
                    if entity_index == 0:
                        predicted = set(predicted.split('|'))
                        gold = set(gold.split('|'))
                        correct = gold.intersection(predicted)
                        predicted_correct[query_index] += (1 if len(correct)>0 else 0)
                        total_predicted[query_index] += 1
                        total_gold[query_index] += 1 
                    else:
                        total_predicted[query_index] += 1
                        if predicted == gold:
                            predicted_correct[query_index] += 1
                        total_gold[query_index] += 1


            print "Entity", entity_index, ":",
            if sum(total_predicted) == 0 :
                continue

            if sum(predicted_correct) == 0 :
                continue

            if self.print_query_scores:
                print "BEGINNING WITH PER QUERY SCORES"

                for query_index in range(num_queries):
                    print "*********************************************"
                    print
                    print "QUERY INDEX:", query_index
                    self.displayScore(predicted_correct[query_index], total_predicted[query_index],\
                                      total_gold[query_index])
                    print
                    print "*********************************************"
                print "NOW SHOWING SCORES AGGREGATED OVER ALL QUERRIES"
            self.displayScore(sum(predicted_correct), sum(total_predicted),sum(total_gold))

    def displayScore(self, predicted_correct, total_predicted, total_gold):
        precision = predicted_correct / total_predicted
        recall = predicted_correct / total_gold
        f1 = (2*precision*recall)/(precision+recall)
        print
        print "PRECISION", precision, "RECALL", recall, "F1", f1
        print "Total match", predicted_correct

    def trainAndEval(self, train_identifiers, test_identifiers, COUNT_ZERO):
        classifier = self.trainClassifier(train_identifiers)
        DECISIONS  = self.predictEntities(classifier)

        debug = True
        if debug:
            self.runExploratoryTests(DECISIONS, train_identifiers, test_identifiers)
            

        majority, max_conf = self.aggregateResults(DECISIONS)
        print "#############################################################"
        print "Evaluation for Classifier baseline with MAJORITY aggregation"
        print
        self.evaluateBaseline(majority, test_identifiers, COUNT_ZERO)

        print
        print "#############################################################"
        print "Evaluation for Classifier baseline with MAX CONFIDENCE aggregation"
        print
        self.evaluateBaseline(max_conf, test_identifiers, COUNT_ZERO)
        print
        print "#############################################################"
    
    def getFeatures(self, article_index, query_index, supporting_article_index, entities, confidences, cosine_sim, context):        
        features= []

        # Construct the feature vector for this (query, supporting article) pair.
        # Per-entity confidence vectors (length 4) for the original article
        # (supporting index 0) and for this supporting article.
        original_confidence = confidences[article_index][query_index][0]
        confidence = confidences[article_index][query_index][supporting_article_index]
        
        # Two indicator features per entity slot: [1, 0] if the supporting
        # entity matches the original, [0, 1] otherwise (an empty original
        # counts as a non-match).
        original_entity = entities[article_index][query_index][0]
        new_entity = entities[article_index][query_index][supporting_article_index]
        match_features = []
        for e_index in range(len(original_entity)):
            if original_entity[e_index] == '':
                match_features += [0, 1]
            elif original_entity[e_index].strip().lower() == new_entity[e_index].strip().lower():
                match_features += [1, 0]
            else:
                match_features += [0, 1]
        
        # Cosine sim array is shifted by one.
        # Index 0 should be 1 as orig is same as itself.
        tfidf = 1 if supporting_article_index == 0 else \
                cosine_sim[article_index]\
                [query_index][supporting_article_index - 1]

        features = original_confidence + confidence + match_features + [tfidf]

        # Append the flattened context feature vectors for this pair.
        for c in context[article_index][query_index][supporting_article_index]:
            features += c
#         assert len(features) == 41
        
        return features
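
    # Assumed layout of the returned vector (dims inferred from the code and
    # the commented-out "assert len(features) == 41"):
    #   [0:4]   original article's per-entity confidences
    #   [4:8]   supporting article's per-entity confidences
    #   [8:16]  match indicators, 2 per entity slot
    #   [16]    tf-idf cosine similarity to the original article
    #   [17:41] flattened context features (24 values, if the assert held)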
    
    def getLabels(self, article_index, query_index, supporting_article_index, entities, identifier):
        # Build labels for this (query, supporting article) pair: each label is
        # an entity slot where the supporting extraction matches gold and
        # differs from the original extraction.
        labels = []
        gold_entities = identifier[article_index]
        new_entities      = entities[article_index][query_index][supporting_article_index]
        orig_entities     = entities[article_index][query_index][0]
        for ind in range(len(gold_entities)):
            ent = new_entities[ind].lower().strip()
            orig_ent = orig_entities[ind].lower().strip()
            gold = gold_entities[ind].lower().strip()
            if gold == "":
                continue
            if ent == "":
                continue
            
            #special handling for shooterName (entity_index = 0)
            if ind == 0:
                new_person = set(ent.split('|'))
                gold_person = set(gold.split('|'))
                if len(new_person.intersection(gold_person)) > 0:
                    if not ent == orig_ent:
                        labels.append(ind)
            else:
                if gold == ent:
                    if not ent == orig_ent:
                        labels.append(ind)
        if labels == [0, 1, 2, 3]:
            labels = [4]
        elif labels == []:
            labels = [5]
        
        assert (len(labels) > 0)
        return labels
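
    # Example: if the supporting extraction fixes only the city (slot 3)
    # relative to the original, getLabels returns [3]; if it fixes all four
    # slots it returns [4]; if it fixes none, [5].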
    
            
    def runExploratoryTests(self, DECISIONS, train_identifiers, test_identifiers):
        print "TRAIN:: Exploring how many times gold entity is not in original document"
        count = collections.defaultdict(lambda:0.)
        total_count = collections.defaultdict(lambda:0.)
        for article_index in range(len(self.TRAIN_ENTITIES)):
            article = self.TRAIN_ENTITIES[article_index]
            for entity_index in range(4):
                for query_index in range(len(article)):
                    query = article[query_index]
                    for supp_index in range(len(query)):
                        orig_entity = query[0][entity_index].strip().lower()
                        entity = query[supp_index][entity_index].strip().lower()
                        gold_ent = train_identifiers[article_index][entity_index].strip().lower() 
                        if gold_ent == "" or gold_ent == 'zero':
                            continue

                        if entity == orig_entity:
                            continue
                        if entity_index > 0: #not shooter
                            if entity == gold_ent:
                                count[entity_index] += 1
                        else:
                            orig_entity = set(orig_entity.split('|'))
                            gold = set(gold_ent.split('|'))
                            entity = set(entity.split('|'))
                            if len(entity.intersection(gold)) > len(orig_entity.intersection(gold)):
                                count[entity_index] += 1                            
                        total_count[entity_index] +=1

#         print "COUNT ", count
#         print "TOTAL ", total_count
        print "ENTS: counts gold not in orig" , [count[i] for i in range(4)]
        print "Ratio" , [count[i]/total_count[i] for i in range(4)]

        print "TEST: Exploring if classifier ever chooses not first entity"
        ones_not_orig = [0] * 4
        ones = [0] * 4
        counts = [0] * 4
        for entity_index in range(4):
            for article_index in range(len(self.TEST_ENTITIES)):
                article = self.TEST_ENTITIES[article_index]
                for query_index in range(len(article)):
                    query = article[query_index]
                    orig_entity = query[0][entity_index].strip().lower()
                    gold = test_identifiers[article_index][entity_index].strip().lower()
                    for supp_index in range(len(query)):
                        decision = DECISIONS[article_index][query_index][supp_index][entity_index]
                        counts[entity_index] += 1
                        if decision == 1:
                            entity = query[supp_index][entity_index].strip().lower()
                            ones[entity_index] += 1
                            if not entity == orig_entity and not entity == "":
                                ones_not_orig[entity_index] += 1

        print "DECS: counts Chosen not in orig" , ones_not_orig
        print "DECS: counts ONES " , ones
        print "Ratio one not matching original entity in prediction", [ ones_not_orig[x]*1. / counts[x] for x in range(4)]

In [533]:
verbose = False

count_name = 0
print "size", len(TEST_CONFIDENCES_CRF)

for entity_ind in range(4):
    entity_name = p.int2tags[entity_ind+1]
    correct = 0
    gold_num = 0
    total = 0
    for article_ind in range(len(TEST_ENTITIES_CRF)):
        article = TEST_ENTITIES_CRF[article_ind]
        gold_ent = test_identifiers[article_ind][entity_ind].strip().lower()
        for query_ind in range(len(article)):
            query = article[query_ind]
            # Score only the original article's extraction (supporting index 0).
            sup = query[0]
            ent = sup[entity_ind].strip().lower()

            if gold_ent == '' or gold_ent == 'zero':
                continue
            if entity_ind == 0:
                # Lenient eval for shooterName: split on '|' and intersect.
                gold_ent_set = set(gold_ent.split('|'))
                correct_int = gold_ent_set.intersection(set(ent.split('|')))
                correct += (1 if len(correct_int) > 0 else 0)
                if not ent == '':
                    count_name += 1
                gold_num += 1
                total += 1
            else:

                if ent == gold_ent: 
                    correct += 1
                gold_num += 1
                total   += 1

    prec = correct*1./total
    recall = correct*1./gold_num
    f1 = 0
    if not prec + recall == 0:
        f1 = 2*(prec*recall)/(prec+recall)
#     print entity_name
#     print "prec", prec, "recall", recall, "f1:" , f1
            
# baseline = Classifier(TRAIN_ENTITIES_CRF, TRAIN_CONFIDENCES_CRF, TRAIN_COSINE_SIM_CRF,\
#              TEST_ENTITIES_CRF, TEST_CONFIDENCES_CRF, TEST_COSINE_SIM_CRF)
   
baseline = Classifier(TRAIN_ENTITIES, TRAIN_CONFIDENCES, TRAIN_COSINE_SIM, \
                      TEST_ENTITIES, TEST_CONFIDENCES, TEST_COSINE_SIM)

baseline.trainAndEval(train_identifiers, test_identifiers, False)


size 292
Class dist [124, 170, 193, 260, 0, 1000]
Total labels 1747
TRAIN:: Exploring how many times gold entity is not in original document
ENTS: counts gold not in orig [4.0, 140.0, 93.0, 260.0]
Ratio [0.0021287919105907396, 0.041716328963051254, 0.015361744301288404, 0.036775106082036775]
TEST: Exploring if classifier ever chooses not first entity
DECS: counts Chosen not in orig [23, 176, 274, 251]
DECS: counts ONES  [1484, 1636, 1734, 1713]
Ratio one not matching original entity in prediction [0.0024562152926099956, 0.018795386586928663, 0.02926099957283212, 0.02680478428022213]
#############################################################
Evaluation for Classifier baseline with MAJORITY aggregation

Entity 0 :
PRECISION 0.447619047619 RECALL 0.447619047619 F1 0.447619047619
Total match 94.0
Entity 1 :
PRECISION 0.704938271605 RECALL 0.704938271605 F1 0.704938271605
Total match 571.0
Entity 2 :
PRECISION 0.670848708487 RECALL 0.670848708487 F1 0.670848708487
Total match 909.0
Entity 3 :
PRECISION 0.549315068493 RECALL 0.549315068493 F1 0.549315068493
Total match 802.0

#############################################################
Evaluation for Classifier baseline with MAX CONFIDENCE aggregation

Entity 0 :
PRECISION 0.452380952381 RECALL 0.452380952381 F1 0.452380952381
Total match 95.0
Entity 1 :
PRECISION 0.707407407407 RECALL 0.707407407407 F1 0.707407407407
Total match 573.0
Entity 2 :
PRECISION 0.684132841328 RECALL 0.684132841328 F1 0.684132841328
Total match 927.0
Entity 3 :
PRECISION 0.552739726027 RECALL 0.552739726027 F1 0.552739726027
Total match 807.0

#############################################################
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished

In [459]:
import sklearn
print sklearn.__version__

# OLD
# shooterName f1: 0.452380952381 aka 45.2
# killedNum   f1: 0.697530864198 aka 69.8
# woundedNum  f1: 0.686346863469 aka 68.6
# city        f1: 0.537671232877 aka 53.8
#
# vs NEW
# shooterName F1 0.452380952381 aka 45.2   diff  0
# killedNum   F1 0.707407407407 aka 70.7   diff +0.9
# woundedNum  F1 0.684132841328 aka 68.4   diff -0.2
# city        F1 0.552739726027 aka 55.3   diff +1.5
#                                          net  +2.2