In [1]:
import json
import pyprind
import sys
import pickle
In [2]:
data_folder = "/Volumes/backup/ccg_tweet_wikifier_data/"
wikidata_file = "/Volumes/backup/ccg_tweet_wikifier_data/wikidata/wikidata-20160404-all.json"
entity_alias_output_file = data_folder+"wikidata/entity_alias.txt"
In [3]:
from corenlp import *
corenlp = StanfordCoreNLP()
In this section, we use Wikidata to build a lexicon of entity names and aliases; if an n-gram from a tweet exists in the lexicon, we consider it a mention candidate.
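As a rough illustration of the idea (with hypothetical names; the real lexicon and n-grams are built below), candidate selection is just a membership test:
In [ ]:
# Sketch only: toy_lexicon / toy_ngrams are made-up stand-ins for the structures built later.
toy_lexicon = {u'chicago bulls': set([u'Q123'])}   # alias -> hypothetical entity ids
toy_ngrams = [u'the', u'chicago bulls', u'tonight']
candidates = [g for g in toy_ngrams if g in toy_lexicon]
# candidates == [u'chicago bulls']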
In [4]:
def join_by_tab(dic):
    """
    Join the entity id, label, and aliases in the input dict into one tab-separated line.
    """
    val = ""
    val += dic.keys()[0]  # entity id
    val += "\t"
    val += "\t".join(dic.values()[0])  # label followed by aliases
    val += "\n"
    return val
def find_en_aliases(entity):
    """
    Return a dict {entity_id: [label, alias_0, alias_1, ...]} for a given entity.
    """
    ret = {}
    entity_id = entity[u'id']
    try:
        ret[entity_id] = [entity[u'labels'][u'en'][u'value']]
    except:
        ret[entity_id] = ["NONE_EN_LABEL"]
    try:
        ret[entity_id].extend([element['value'] for element in entity[u'aliases'][u'en']])
    except:
        pass
    return ret
def load_wikidata(wikidata_file, output_file):
    """
    Stream the Wikidata JSON dump and write one "entity_id<TAB>label<TAB>aliases..." line per entity.
    """
    line_count = 20951710  # line count of the 2016-04-04 Wikidata dump
    # for progress bar
    bar = pyprind.ProgBar(line_count, width=70, monitor=True)
    # set up error statistics
    errors = {}
    json_errors = []
    count = 0
    # write to file
    with open(output_file, "w") as g:
        with open(wikidata_file, "rb") as f:
            for line in f:
                # update progress bar
                bar.update()
                try:
                    # each line is one entity followed by a trailing comma; drop it before parsing
                    entity_content = json.loads(line.strip()[:-1])
                    try:
                        # get the English label and aliases and join them by tabs
                        output = join_by_tab(find_en_aliases(entity_content))
                        g.write(output.encode('utf8'))
                    except:
                        errors[entity_content[u'id']] = sys.exc_info()[0]
                except:
                    json_errors.append(sys.exc_info()[0])
    print json_errors, errors
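As a quick illustration (a hand-written entity stub, not a real dump record), find_en_aliases extracts the English label and aliases, and join_by_tab flattens them into one tab-separated row:
In [ ]:
# Hypothetical, trimmed-down entity mimicking the structure of one dump line.
toy_entity = {
    u'id': u'Q1234567',
    u'labels': {u'en': {u'value': u'Example Entity'}},
    u'aliases': {u'en': [{u'value': u'Example'}, {u'value': u'An Example'}]},
}
print find_en_aliases(toy_entity)
# {u'Q1234567': [u'Example Entity', u'Example', u'An Example']}
print join_by_tab(find_en_aliases(toy_entity))
# 'Q1234567\tExample Entity\tExample\tAn Example\n'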
In [5]:
# Uncomment to run the extraction over the full dump (slow)
# load_wikidata(wikidata_file, entity_alias_output_file)
In [6]:
entity_alias_file = entity_alias_output_file
The file format of entity_alias.txt is as follows (tab-separated):
wikidata_id <TAB> label <TAB> alias_1 <TAB> ... <TAB> alias_n
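For example, a hypothetical row: Q1234567 <TAB> Example Entity <TAB> Example <TAB> An Example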
In [7]:
alias_entity_file = data_folder + "wikidata/alias_entity.txt"
In [8]:
def reverse_mapping(src_file):
    """
    Build a mapping from alias to the set of entity ids that carry it.
    """
    # for progress bar
    line_count = 20951708
    bar = pyprind.ProgBar(line_count, width=70, monitor=True)
    a2e = {}
    with open(src_file, "rb") as f:
        for line in f:
            bar.update()
            segments = line.strip().split("\t")
            entity = segments[0]
            for seg in segments[1:]:
                if seg not in a2e:
                    a2e[seg] = set()
                a2e[seg].add(entity)
    return a2e
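As a quick sanity check (a toy file with made-up ids; the progress bar just under-fills here since its total is hard-coded to the full dump size):
In [ ]:
import tempfile, os
# Two hypothetical entities sharing the alias "Alpha".
toy_rows = "Q1\tAlpha\tThe Alpha\nQ2\tAlpha\tBeta\n"
toy_path = os.path.join(tempfile.mkdtemp(), "toy_entity_alias.txt")
with open(toy_path, "w") as tmp:
    tmp.write(toy_rows)
toy_map = reverse_mapping(toy_path)
print toy_map["Alpha"]   # expected: set(['Q1', 'Q2'])
print toy_map["Beta"]    # expected: set(['Q2'])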
In [9]:
alias_to_entity = reverse_mapping(entity_alias_file)
In [12]:
entity_alias_output_txt_file = data_folder+"wikidata/alias2entity.txt"
bar = pyprind.ProgBar(len(alias_to_entity), width=70, monitor = True)
with open(entity_alias_output_txt_file, "wb") as f:
    for key in alias_to_entity.keys():
        bar.update()
        line = [key]
        line.extend(alias_to_entity[key])
        text = "\t".join(line)
        f.write(text+"\n")
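Each row of alias2entity.txt lists an alias followed by every entity id that carries it; for example, a hypothetical row: Example <TAB> Q1234567 <TAB> Q7654321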
In [13]:
len(alias_to_entity)
Out[13]:
In [ ]:
entity_alias_output_file = data_folder+"wikidata/alias2entity.pickle"
with open(entity_alias_output_file, "wb") as f:
    pickle.dump(alias_to_entity, f)
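The mapping can be loaded back later without re-parsing the dump (a minimal sketch using the pickle written above):
In [ ]:
# Reload the alias -> entity-id mapping from the pickle written above.
with open(data_folder + "wikidata/alias2entity.pickle", "rb") as f:
    alias_to_entity = pickle.load(f)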
In [ ]:
alias_to_entity_lower = {}
for als in alias_to_entity.keys():
    als_lower = als.lower()
    if als_lower in alias_to_entity_lower:
        alias_to_entity_lower[als_lower] |= alias_to_entity[als]
    else:
        # copy, so that later in-place unions do not mutate alias_to_entity's sets
        alias_to_entity_lower[als_lower] = set(alias_to_entity[als])
In [ ]:
# dump the lower-cased mapping to file
entity_alias_output_file = data_folder+"wikidata/alias2entity_lower.pickle"
with open(entity_alias_output_file, "wb") as f:
    pickle.dump(alias_to_entity_lower, f)
Load the tweet corpus.
In [ ]:
with open(data_folder+"Tweet/NEEL_tweets(with_grams).pickle", "rb") as f:
tweet_corpus = pickle.load(f)
In [ ]:
from stop_words import get_stop_words
In [ ]:
stop_words = get_stop_words('en')
In [ ]:
stop_words
In [ ]:
def remove_special(text):
    """Strip a single leading '$', '#', or '@' from a gram, if present."""
    if text and text[0] in ['$', '#', '@']:
        return text[1:]
    return text
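A few illustrative calls (hypothetical grams), showing that only a single leading '$', '#', or '@' is stripped:
In [ ]:
print remove_special("#nba")       # -> 'nba'
print remove_special("@user123")   # -> 'user123'
print remove_special("warriors")   # -> 'warriors' (unchanged)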
In [ ]:
def experiment_gram_matching(tweets):
    """
    Count how many golden mentions appear verbatim among each tweet's extracted n-grams.
    """
    total = 0
    match = 0
    try:
        for tweet in tweets.values():
            goldens = tweet['goldens']
            for g in goldens:
                total += 1
                mention = g['mention']
                gram_set = set()
                for grams in tweet['ngrams'].values():
                    for gram in grams:
                        gram_set.add(remove_special(gram))
                if mention in gram_set:
                    match += 1
                else:
                    # print the tweets whose golden mention is not covered by any n-gram
                    print tweet['tweet_info']['id']
                    print tweet['tweet_info']['text']
                    print "MENTION:", mention
                    print(type(mention))
                    print "======"
    except:
        print tweet['tweet_info']['id']
    return [match, total]
In [ ]:
res = experiment_gram_matching(tweet_corpus)
In [ ]:
res
In [ ]:
# Pre-compute, for each tweet, the set of extracted n-grams and the set of lower-cased golden mentions.
for tweet in tweet_corpus.values():
    tweet["gram_set"] = set()
    for gram_set in tweet["ngrams"].values():
        tweet["gram_set"] |= set(gram_set)
    tweet["mention_set"] = set([item['mention'].lower() for item in tweet['goldens']])
In [ ]:
stats = {"tp":0., "fp":0., "tn":0., "fn":0.}
for tweet in tweet_corpus.values():
for gram in tweet['gram_set']:
gram_low = gram.lower()
if len(gram_low) < 2:
continue
if gram_low in stop_words:
continue
if gram_low in alias_to_entity_lower:
if gram_low in tweet['mention_set']:
stats['tp'] +=1
else:
stats['fp'] +=1
print gram_low
else:
if gram_low in tweet['mention_set']:
stats['fn'] +=1
else:
stats['tn'] +=1
In [ ]:
def check_upper(text):
    """Return True if the text contains at least one uppercase character."""
    for c in text:
        if c.isupper():
            return True
    return False
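For example (hypothetical grams), check_upper keeps grams with at least one uppercase character and filters out all-lowercase ones:
In [ ]:
print check_upper("iPhone")        # True: contains an uppercase character
print check_upper("screenwriter")  # False: all lowercase, so it is skipped below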
In [ ]:
# Same as above, but on the original (cased) grams: a gram only counts as a candidate if it is in the
# exact-case lexicon and contains at least one uppercase character.
for tweet in tweet_corpus.values():
    tweet["gram_set"] = set()
    for gram_set in tweet["ngrams"].values():
        tweet["gram_set"] |= set(gram_set)
    tweet["mention_set"] = set([item['mention'] for item in tweet['goldens']])
stats = {"tp": 0., "fp": 0., "tn": 0., "fn": 0.}
for tweet in tweet_corpus.values():
    print "======="
    print tweet['tweet_info']['id']
    print tweet['tweet_info']['text']
    print tweet['mention_set']
    for gram in tweet['gram_set']:
        gram_low = gram
        if len(gram_low) <= 3:
            continue
        if gram_low in stop_words:
            continue
        # if check_upper(gram_low) == False:
        elif gram_low in alias_to_entity and check_upper(gram_low):
            if gram_low in tweet['mention_set']:
                stats['tp'] += 1
            else:
                stats['fp'] += 1
                print gram_low
        else:
            if gram_low in tweet['mention_set']:
                stats['fn'] += 1
            else:
                stats['tn'] += 1
In [ ]:
stats
In [ ]:
precision = stats['tp']/(stats['tp']+stats['fp'])
recall = stats['tp']/(stats['tp']+stats['fn'])
F = 2*precision*recall/(precision+recall)
In [ ]:
print precision, recall, F
In [ ]:
alias_to_entity['screenwriter']
In [ ]:
recall
In [ ]:
def experiment_alias(tweets, alias_mapper):
    """
    For each golden mention, resolve cashtags/hashtags/URLs/user mentions to their surface text,
    then check whether the (lower-cased) mention is covered by the alias lexicon.
    """
    length_sum = 0
    match = 0
    hit = 0
    total = 0
    for tweet in tweets.values():
        goldens = tweet['goldens']
        for g in goldens:
            total += 1
            mention = g['mention']
            real_mention = mention
            if mention in tweet['cashtag_mapping']:
                real_mention = tweet['cashtag_mapping'][mention]['text']
            elif mention in tweet['hashtag_mapping']:
                real_mention = tweet['hashtag_mapping'][mention]['text']
            elif mention in tweet['url_mapping']:
                real_mention = tweet['url_mapping'][mention]['url']
            elif mention in tweet['usermention_mapping']:
                real_mention = tweet['usermention_mapping'][mention]['name']
            low = real_mention.lower()
            if low in alias_mapper:
                match += 1
                length_sum += len(alias_mapper[low])
            else:
                print tweet['tweet_info']['id'], real_mention, "|", g['wiki_title']
    print total, match, hit, length_sum
In [ ]:
alias_to_entity_lower = {}
for als in alias_to_entity.keys():
    als_lower = als.lower()
    if als_lower in alias_to_entity_lower:
        alias_to_entity_lower[als_lower] |= alias_to_entity[als]
    else:
        # copy, so that in-place unions do not mutate alias_to_entity's sets
        alias_to_entity_lower[als_lower] = set(alias_to_entity[als])
In [ ]:
experiment_alias(tweet_corpus, alias_to_entity_lower)
In [ ]:
for s in alias_to_entity_lower:
    pass
In [ ]:
15124./1981  # average number of candidate entities per matched mention
exact: 2420 golden mentions, 1800 found in the lexicon
lower_case: 2420 golden mentions, 1981 found in the lexicon
In [ ]: