In [1]:
import pyprind
import sys
import pickle
In [2]:
data_folder = "/Volumes/backup/ccg_tweet_wikifier_data/"
In [ ]:
# Set API keys
consumer_key = 'YOUR_CONSUMER_KEY'
consumer_secret = 'YOUR_CONSUMER_SECRET'
access_token = 'YOUR_ACCESS_TOKEN'
access_token_secret = 'YOUR_ACCESS_TOKEN_SECRET'
In [ ]:
# Set up API
import tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
label-trainDev.tsv -- train split of the NEEL [2] dataset
label-train.tsv -- train split of label-trainDev.tsv used in this paper (for parameter tuning)
label-dev.tsv -- dev split of label-trainDev.tsv used in this paper (for parameter tuning)
label-test.tsv -- test split of the NEEL [2] dataset
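Each of these files is tab-separated. Judging from how the rows are consumed below (golden_standard / golden_generator), every line carries six fields: tweet id, start offset, end offset, Wikipedia title, Freebase id, and mention surface form. A hypothetical row (values invented for illustration, fields separated by tabs):
123456789012345678	10	16	Barack_Obama	/m/02mjmr	Obama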
In [32]:
file_list = ["label-trainDev.tsv", "label-train.tsv", "label-dev.tsv", "label-test.tsv"]
Get the set of unique tweet IDs from the label files
In [ ]:
tweet_ids = set()
for file_name in file_list:
    with open("./data/v1-NEELOnly/" + file_name, "r") as f:
        for line in f:
            segs = line.split("\t")
            tweet_id = segs[0]
            tweet_ids.add(tweet_id)
Define a function to retrieve tweets
In [ ]:
def fill_tweet_set(id_list):
    # set up progress bar
    bar = pyprind.ProgBar(len(id_list))
    tweet_set = {}
    failed = {}
    for tweet_id in id_list:
        bar.update()
        try:
            tweet_set[tweet_id] = api.get_status(tweet_id)._json
        except Exception:
            # record the exception class for ids that could not be fetched
            # (deleted tweets, protected accounts, etc.)
            failed[tweet_id] = sys.exc_info()[0]
    return (tweet_set, failed)
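get_status fetches one tweet per request, which is slow for thousands of ids. A possible alternative is a batched lookup; the sketch below assumes tweepy 3.x, where API.statuses_lookup accepts up to 100 ids per call (renamed lookup_statuses in tweepy 4.x), and is not what this notebook actually ran:
def fill_tweet_set_batched(id_list, batch_size=100):
    # hypothetical batched variant: up to 100 tweets per API call
    tweet_set = {}
    ids = list(id_list)
    for i in range(0, len(ids), batch_size):
        for status in api.statuses_lookup(ids[i:i + batch_size]):
            tweet_set[status.id_str] = status._json
    # note: ids of deleted/protected tweets are silently dropped by the API
    return tweet_set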
In [ ]:
result = fill_tweet_set(tweet_ids)
Save the retrieved tweets (and the failed ids) to a pickle file
In [ ]:
with open(data_folder+"Tweet/NEEL_tweets(raw).pickle", "wb") as f:
    pickle.dump(result, f)
In [ ]:
tweet_corpus = {}
for tweet_id in result[0].keys():
    tweet_corpus[tweet_id] = {}
    tweet_corpus[tweet_id]['tweet_info'] = result[0][tweet_id]
with open(data_folder+"Tweet/NEEL_tweets(initial).pickle", "wb") as f:
    pickle.dump(tweet_corpus, f)
A sample tweet object in JSON
In [ ]:
tweet_corpus['93314579924393984']
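Only a small part of the returned object is used later in this notebook, roughly the fields below (field names are the standard Twitter API ones; values here are hypothetical):
{
    'id_str': '123456789012345678',
    'text': 'raw tweet text ...',
    'entities': {
        'hashtags':      [{'text': 'nba', 'indices': [20, 24]}],
        'user_mentions': [{'screen_name': 'someuser', 'indices': [0, 9]}],
        'symbols':       [{'text': 'AAPL', 'indices': [30, 35]}],   # cashtags
        'urls':          [{'url': 'http://t.co/xxxxxxx', 'expanded_url': 'http://example.com/'}]
    }
}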
In [28]:
with open(data_folder+"Tweet/NEEL_tweets(initial).pickle", "rb") as f:
    tweet_corpus = pickle.load(f)
In [29]:
def golden_generator(tid, s_idx, e_idx, wiki_title, freebase, mention):
    # pack one gold-standard annotation into a dict
    ret = {}
    ret['tid'] = tid
    ret['start_idx'] = s_idx
    ret['end_idx'] = e_idx
    ret['wiki_title'] = wiki_title
    ret['freebase_title'] = freebase
    ret['mention'] = mention
    return ret
In [30]:
def golden_standard(file_list, tweet_corpus):
    unique_lines = set()
    for file_name in file_list:
        with open("./data/v1-NEELOnly/" + file_name, "r") as f:
            for line in f:
                unique_lines.add(line.strip())
    for tw in tweet_corpus.values():
        tw['goldens'] = []
    for line in unique_lines:
        segments = line.strip().split("\t")
        tid = segments[0]
        if tid in tweet_corpus:
            golden = golden_generator(segments[0], segments[1], segments[2],
                                      segments[3], segments[4], segments[5])
            tweet_corpus[tid]['goldens'].append(golden)
Now load the gold-standard annotations into the tweet corpus.
In [33]:
golden_standard(file_list, tweet_corpus)
In [54]:
for tweet in tweet_corpus.values():
    text = tweet['tweet_info']['text']
    for g in tweet['goldens']:
        # recover the surface form from the UTF-8 encoded text using the annotated byte offsets
        g['mention_orig'] = text.encode('utf-8')[int(g['start_idx']):int(g['end_idx'])]
This is an example of the gold-standard annotations
In [55]:
tweet_corpus['93314579924393984']['goldens']
Out[55]:
Let's look at the hashtag entities of an example tweet:
In [56]:
print tweet_corpus['93141776474456064']['tweet_info']['entities']['hashtags']
In [57]:
def make_hashtag_mapping(tweet):
    # map hashtag text -> hashtag entity object
    ret = {}
    for tag in tweet['tweet_info']['entities']['hashtags']:
        ret[tag['text']] = tag
    return ret
Let's see an example of the user-mention entities:
In [58]:
print tweet_corpus['93141776474456064']['tweet_info']['entities']['user_mentions']
In [59]:
def make_usermention_mapping(tweet):
    ret = {}
    for name in tweet['tweet_info']['entities']['user_mentions']:
        ret[name['screen_name']] = name
    return ret
In [60]:
print tweet_corpus['100982604374880256']['tweet_info']['entities']['symbols']
In [61]:
def make_cashtag_mapping(tweet):
    ret = {}
    for tag in tweet['tweet_info']['entities']['symbols']:
        ret[tag['text']] = tag
    return ret
In [62]:
print tweet_corpus['100982604374880256']['tweet_info']['entities']['urls']
In [63]:
def make_url_mapping(tweet):
    ret = {}
    for url in tweet['tweet_info']['entities']['urls']:
        ret[url['url']] = url
    return ret
In [64]:
for tweet in tweet_corpus.values():
    tweet['hashtag_mapping'] = make_hashtag_mapping(tweet)
    tweet['usermention_mapping'] = make_usermention_mapping(tweet)
    tweet['cashtag_mapping'] = make_cashtag_mapping(tweet)
    tweet['url_mapping'] = make_url_mapping(tweet)
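These mappings let later steps look an entity up by its surface text and get back the full Twitter entity object, including its character indices. A small hypothetical lookup, assuming the tweet contains the hashtag #nba:
tweet = tweet_corpus['123456789012345678']      # hypothetical id
tag = tweet['hashtag_mapping'].get('nba')       # None if the tweet has no #nba
if tag is not None:
    start, end = tag['indices']                 # character span of "#nba" in the text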
In [65]:
with open(data_folder+"Tweet/NEEL_tweets(with_tw).pickle", "wb") as f:
    pickle.dump(tweet_corpus, f)
Loading the tweets back from disk
In [66]:
with open(data_folder+"Tweet/NEEL_tweets(with_tw).pickle", "rb") as f:
    tweet_corpus = pickle.load(f)
In [67]:
from corenlp import *
corenlp = StanfordCoreNLP()
In [68]:
import json

def stanford_parse(corenlp, text):
    """
    Parse the text with CoreNLP and decode the JSON output into a dict.
    """
    return json.loads(corenlp.parse(text))
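The n-gram code below relies on only a small part of this dict: a 'sentences' list whose items carry a 'words' list of (token, attributes) pairs with character offsets. Roughly the following shape (a sketch inferred from how the fields are accessed below; values are hypothetical):
{
    'sentences': [
        {
            'text': 'Obama visited Ohio .',
            'words': [
                ['Obama',   {'CharacterOffsetBegin': '0',  'CharacterOffsetEnd': '5'}],
                ['visited', {'CharacterOffsetBegin': '6',  'CharacterOffsetEnd': '13'}],
                ['Ohio',    {'CharacterOffsetBegin': '14', 'CharacterOffsetEnd': '18'}],
                ['.',       {'CharacterOffsetBegin': '19', 'CharacterOffsetEnd': '20'}]
            ]
        }
    ]
}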
In [69]:
bar = pyprind.ProgBar(len(tweet_corpus), width=70)
for tweet in tweet_corpus.values():
    bar.update()
    text = tweet['tweet_info']['text']
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    # tweet['stanford_parsed_text'] = corenlp.parse(text)
    tweet['stanford_parsed'] = stanford_parse(corenlp, text)
In [12]:
# note: 'stanford_parsed_text' is only present if the commented-out raw-parse line above is used;
# the rest of the notebook works with the JSON-decoded 'stanford_parsed' instead
text = tweet_corpus['94518623552552961']['stanford_parsed_text'].replace("\n", " ")
In [21]:
tweet_corpus['94518623552552961']['tweet_info']['text']
Out[21]:
In [24]:
tweet_corpus['94518623552552961']['ngrams'][2]
Out[24]:
In [25]:
tweet_corpus['94518623552552961']['goldens']
Out[25]:
In [26]:
tweet_corpus['94518623552552961']['tweet_info']['text']
Out[26]:
In [103]:
# helper functions
def remove_zero(inlist):
    # drop empty strings in place (removing items while iterating would skip elements)
    inlist[:] = [ele for ele in inlist if len(ele) > 0]

def gram_to_string(segments, sentence_text):
    """
    Return the surface form of the given n-gram as (text, start_idx, end_idx).
    """
    start_idx = segments[0][1]['CharacterOffsetBegin']
    end_idx = segments[-1][1]['CharacterOffsetEnd']
    text = sentence_text[int(start_idx): int(end_idx)].encode('UTF-8')
    return (text, start_idx, end_idx)

def n_grams_helper(parsed, n, sentence_text):
    """
    Collect all n-grams of one parsed sentence.
    """
    ret = []
    words = parsed['words']
    length = len(words)
    if length < n:
        return []
    for idx in range(0, length - n + 1):
        segments = words[idx: idx + n]
        ret.append(gram_to_string(segments, sentence_text))
    return ret
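As a quick illustration (hand-built input, not from the corpus): for a two-word sentence, n_grams_helper returns one tuple per n-gram, carrying the surface text and its character offsets.
sentence = {'words': [['Hello', {'CharacterOffsetBegin': '0', 'CharacterOffsetEnd': '5'}],
                      ['world', {'CharacterOffsetBegin': '6', 'CharacterOffsetEnd': '11'}]]}
n_grams_helper(sentence, 1, 'Hello world')
# -> [('Hello', '0', '5'), ('world', '6', '11')]
n_grams_helper(sentence, 2, 'Hello world')
# -> [('Hello world', '0', '11')]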
In [109]:
def n_grams_generator(stanford_core, text, max_n, parsed):
    grams = {}
    # parsed = stanford_parse(stanford_core, text)
    sentences = parsed['sentences']
    for n in range(1, max_n):
        grams[n] = []
        for sen in sentences:
            grams[n].extend(n_grams_helper(sen, n, text))
        remove_zero(grams[n])
    return grams
In [120]:
bar = pyprind.ProgBar(len(tweet_corpus), width=70)
for tweet in tweet_corpus.values():
    bar.update()
    tweet['ngrams'] = n_grams_generator(corenlp, tweet['tweet_info']['text'],
                                        100, tweet['stanford_parsed'])
In [121]:
# n_grams_reprocess: split n-grams that contain exactly one "/", "-", "?" or "."
# and add the pieces as extra candidates
# (note: these splits assume plain surface strings; gram_to_string above returns
#  (text, start, end) tuples)
for tweet in tweet_corpus.values():
    for n in tweet['ngrams'].keys():
        for onegram in tweet['ngrams'][n]:
            if onegram.count("/") == 1:
                if "http" in onegram:
                    continue
                segments = onegram.split("/")
                remove_zero(segments)
                tweet['ngrams'][n].extend(segments)
            if onegram.count("-") == 1:
                segments = onegram.split("-")
                remove_zero(segments)
                tweet['ngrams'][n].extend(segments)
            if onegram.count("?") == 1:
                segments = onegram.split("?")
                remove_zero(segments)
                tweet['ngrams'][n].extend(segments)
            if onegram.count(".") == 1:
                segments = onegram.split(".")
                remove_zero(segments)
                tweet['ngrams'][n].extend(segments)
        remove_zero(tweet['ngrams'][n])
In [122]:
with open(data_folder+"Tweet/NEEL_tweets(with_grams).pickle", "wb") as f:
    pickle.dump(tweet_corpus, f)
In [123]:
tweet_corpus['92955019615272961']
Out[123]: