Mention Detection

Jiarui Xu - jxu57@illinois.edu


In [1]:
import json
import pyprind
import sys
import pickle

In [2]:
data_folder = "/Volumes/backup/ccg_tweet_wikifier_data/"
wikidata_file = "/Volumes/backup/ccg_tweet_wikifier_data/wikidata/wikidata-20160404-all.json"
entity_alias_output_file = data_folder+"wikidata/entity_alias.txt"

In [3]:
from corenlp import *
corenlp = StanfordCoreNLP()


Loading Models: 5/5                                                            

1 Wikidata Alias

In this section, we use Wikidata to build a lexicon of entity aliases; if an n-gram of a tweet exists in the lexicon, we consider it a mention candidate.
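As a minimal sketch of that test (assuming the lowercased alias dictionary alias_to_entity_lower built later in this section; is_mention_candidate is a hypothetical helper, not part of the pipeline below):

def is_mention_candidate(ngram, alias_to_entity_lower):
    # an n-gram is a candidate mention if its lowercased form is a known alias
    return ngram.lower() in alias_to_entity_lower

# e.g. is_mention_candidate("screenwriter", alias_to_entity_lower) would return True
# if "screenwriter" is an alias of some Wikidata entity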

1.1 Load Wikidata JSON file


In [4]:
def join_by_tab(dic):
    """
    Join the entity id, label, and aliases in the input dict with tabs
    """
    val = ""
    val += dic.keys()[0] # entity id
    val += "\t"
    val += "\t".join(dic.values()[0]) # english label followed by aliases
    val += "\n"
    return val

def find_en_aliases(entity):
    """
    Return a dict {entity_id: [label, alias_0, alias_1, ...]} for a given entity
    """
    
    ret = {}
    entity_id = entity[u'id']
    
    try:
        ret[entity_id]= [entity[u'labels'][u'en'][u'value']]
    except:
        ret[entity_id] = ["NONE_EN_LABEL"]
    
    try:
        ret[entity_id].extend([element['value'] for element in entity[u'aliases'][u'en']])
    except:
        pass
    
    return ret

def load_wikidata(wikidata_file, output_file):
    
    line_count = 20951710    # line count of 04_04 wikidata
    
    # for progress bar
    bar = pyprind.ProgBar(line_count, width=70, monitor = True)
    
    # set up error statistics
    errors = {}
    json_errors = []
    
    count = 0
    
    # write to file
    with open(output_file, "w") as g:
        with open(wikidata_file, "rb") as f:
            for line in f:
                
                # update progress bar
                bar.update()
                
                try:
                    # parse the entity on this line; strip the trailing comma of the JSON array dump
                    entity_content = json.loads(line.strip()[:-1])
                    try:
                        # get aliases and connect them by tab
                        output = join_by_tab(find_en_aliases(entity_content))
                        g.write(output.encode('utf8'))
                    except:
                        errors[entity_content[u'id']] = sys.exc_info()[0]
                except:
                    json_errors.append(sys.exc_info()[0])

    print json_errors, errors

Load Wikidata and extract aliases (the cell below is left commented out to avoid re-running the full extraction).


In [5]:
# Uncomment to load

# load_wikidata(wikidata_file, entity_alias_output_file)

In [6]:
entity_alias_file = entity_alias_output_file

The file format is as follows (one entity per line, tab-separated):

wikidata_id    label    alias_1 ... alias_n
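For illustration, a line of entity_alias.txt would look roughly like the sample below (the id, label, and aliases here are illustrative, not copied from the dump):

# hypothetical sample line (fields separated by tabs)
sample = "Q42\tDouglas Adams\tDouglas Noel Adams\tDNA"

segments = sample.split("\t")
entity_id = segments[0]   # wikidata id
label = segments[1]       # english label, or "NONE_EN_LABEL" if missing
aliases = segments[2:]    # zero or more aliases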

1.2 Build Alias Mapping


In [7]:
alias_entity_file = data_folder + "wikidata/alias_entity.txt"

(1) reverse mapping


In [8]:
def reverse_mapping(src_file):
    """
    Build a mapping from each alias to the set of entity ids that carry it
    """
    # for progress bar
    line_count = 20951708
    bar = pyprind.ProgBar(line_count, width=70, monitor = True)
    
    a2e = {}
    
    with open(src_file, "rb") as f:
        for line in f:
            bar.update()
            segments = line.strip().split("\t")
            entity = segments[0]
            for seg in segments[1:]:
                if seg not in a2e:
                    a2e[seg] = set()
                a2e[seg].add(entity)

    return a2e

In [9]:
alias_to_entity = reverse_mapping(entity_alias_file)


0%                                                                  100%
[######################################################################] | ETA: 00:00:00

In [ ]:
entity_alias_output_txt_file = data_folder+"wikidata/alias2entity.txt"

bar = pyprind.ProgBar(len(alias_to_entity), width=70, monitor = True)

with open(entity_alias_output_txt_file, "wb") as f:
    for key in alias_to_entity.keys():
        bar.update()
        line = [key]
        line.extend(alias_to_entity[key])

        text = "\t".join(line)
        # aliases and entity ids are already utf-8 byte strings, so write them as-is
        f.write(text + "\n")

In [ ]:
entity_alias_output_file = data_folder+"wikidata/alias2entity.pickle"

with open(entity_alias_output_file, "wb") as f:
    pickle.dump(alias_to_entity, f)

(2) Lowercase the aliases


In [ ]:
alias_to_entity_lower = {}
for als in alias_to_entity.keys():
    als_lower = als.lower()
    if als_lower in alias_to_entity_lower:
        alias_to_entity_lower[als_lower] |= alias_to_entity[als]
    else:
        # copy the set so later unions do not mutate the sets in alias_to_entity
        alias_to_entity_lower[als_lower] = set(alias_to_entity[als])

In [ ]:
# dump the mapping to file
entity_alias_output_file = data_folder+"wikidata/alias2entity_lower.pickle"

with open(entity_alias_output_file, "wb") as f:
    pickle.dump(alias_to_entity_lower, f)

1.3 Experiment with the Golden Standard

Load the tweet corpus


In [ ]:
with open(data_folder+"Tweet/NEEL_tweets(with_grams).pickle", "rb") as f:
    tweet_corpus = pickle.load(f)

In [ ]:
from stop_words import get_stop_words


In [ ]:
stop_words = get_stop_words('en')

In [ ]:
stop_words

(1) First experiment: how well the extracted n-grams cover the golden mentions


In [ ]:
def remove_special(text):
    """
    Strip a leading $, #, or @ so cashtags, hashtags, and user mentions
    can be compared against plain aliases
    """
    if text and text[0] in ['$', '#', "@"]:
        return text[1:]
    else:
        return text

In [ ]:
def experiment_gram_matching(tweets):
    """
    Count how many golden mentions appear among each tweet's n-grams
    (after stripping leading $/#/@ from the grams)
    """
    total = 0
    match = 0
    try:
        for tweet in tweets.values():
            goldens = tweet['goldens']
            for g in goldens:
                total += 1
                mention = g['mention']

                gram_set = set()
                for grams in tweet['ngrams'].values():
                    for gram in grams:
                        gram_set.add(remove_special(gram))

                if mention in gram_set:
                    match += 1
                else:
                    # pass
                    print tweet['tweet_info']['id']
                    print tweet['tweet_info']['text']
                    print "MENTION:", mention
                    print(type(mention))
                    print "======"
    except:
        print tweet['tweet_info']['id']
    return [match, total]

In [ ]:
res = experiment_gram_matching(tweet_corpus)

In [ ]:
res

In [ ]:
for tweet in tweet_corpus.values():
    tweet["gram_set"] = set()
    for gram_set in tweet["ngrams"].values():
        tweet["gram_set"] |= set(gram_set)
    tweet["mention_set"] = set([item['mention'].lower() for item in tweet['goldens']])

In [ ]:
stats = {"tp":0., "fp":0., "tn":0., "fn":0.}

for tweet in tweet_corpus.values():
    for gram in tweet['gram_set']:
        gram_low = gram.lower()
        if len(gram_low) < 2:
            continue
        if gram_low in stop_words:
            continue
        if gram_low in alias_to_entity_lower:
            if gram_low in tweet['mention_set']:
                stats['tp'] +=1
            else:
                stats['fp'] +=1
                print gram_low
        else:
            if gram_low in tweet['mention_set']:
                stats['fn'] +=1
            else:
                stats['tn'] +=1

In [ ]:
def check_upper(text):
    """
    Return True if the text contains at least one uppercase character
    """
    for c in text:
        if c.isupper():
            return True
    return False

In [ ]:
for tweet in tweet_corpus.values():
    tweet["gram_set"] = set()
    for gram_set in tweet["ngrams"].values():
        tweet["gram_set"] |= set(gram_set)
    tweet["mention_set"] = set([item['mention'] for item in tweet['goldens']])
    
stats = {"tp":0., "fp":0., "tn":0., "fn":0.}

for tweet in tweet_corpus.values():
    print "======="
    print tweet['tweet_info']['id']
    print tweet['tweet_info']['text']
    print tweet['mention_set']
    for gram in tweet['gram_set']:
        gram_low = gram
        if len(gram_low) <= 3:
            continue
        if gram_low in stop_words:
            continue
        
        # only consider grams that are known aliases and contain an uppercase letter
        if gram_low in alias_to_entity and check_upper(gram_low):
            
            if gram_low in tweet['mention_set']:
                stats['tp'] +=1
            else:
                stats['fp'] +=1
                print gram_low
        else:
            if gram_low in tweet['mention_set']:
                stats['fn'] +=1
            else:
                stats['tn'] +=1

In [ ]:
stats

In [ ]:
precision = stats['tp']/(stats['tp']+stats['fp'])
recall = stats['tp']/(stats['tp']+stats['fn'])

F = 2*precision*recall/(precision+recall)

In [ ]:
print precision, recall, F

In [ ]:
alias_to_entity['screenwriter']

In [ ]:
recall

In [ ]:
def experiment_alias(tweets, alias_mapper):
    """
    Count how many golden mentions are found in the alias lexicon, after
    resolving cashtags/hashtags/urls/user mentions to their surface text.
    length_sum accumulates the number of candidate entities per matched mention.
    """
    length_sum = 0
    match = 0
    hit = 0
    total = 0
    for tweet in tweets.values():
        goldens = tweet['goldens']
        for g in goldens:
            total += 1
            mention = g['mention']
            real_mention = mention
            
            if mention in tweet['cashtag_mapping']:
                real_mention = tweet['cashtag_mapping'][mention]['text']
            elif mention in tweet['hashtag_mapping']:
                real_mention = tweet['hashtag_mapping'][mention]['text']
            elif mention in tweet['url_mapping']:
                real_mention = tweet['url_mapping'][mention]['url']
            elif mention in tweet['usermention_mapping']:
                real_mention = tweet['usermention_mapping'][mention]['name']
            
            low = real_mention.lower()
            if low in alias_mapper:
                match += 1
                length_sum += len(alias_mapper[low])
            else:
                print tweet['tweet_info']['id'], real_mention, "|", g['wiki_title']
    print total, match, hit, length_sum

Lowercase the aliases


In [ ]:
alias_to_entity_lower = {}
for als in alias_to_entity.keys():
    als_lower = als.lower()
    if als_lower in alias_to_entity_lower:
        alias_to_entity_lower[als_lower] |= alias_to_entity[als]
    else:
        # copy the set so later unions do not mutate the sets in alias_to_entity
        alias_to_entity_lower[als_lower] = set(alias_to_entity[als])

In [ ]:
experiment_alias(tweet_corpus, alias_to_entity_lower)


In [ ]:
15124./1981  # presumably length_sum / match from experiment_alias: average candidate entities per matched mention (~7.6)

exact match: 1800 of the 2420 golden mentions are found in the alias lexicon
lowercased match: 1981 of 2420
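For reference, the corresponding recall figures, computed from the counts above (a quick sanity check, not additional notebook output):

exact_recall = 1800. / 2420   # ~0.744
lower_recall = 1981. / 2420   # ~0.819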

