Data Preparation

Jiarui Xu - jxu57@illinois.edu


In [1]:
import pyprind
import sys
import pickle

In [2]:
# Root folder for cached tweet data. Overridable via the CCG_DATA_FOLDER
# environment variable so the notebook is not tied to one machine's
# absolute /Volumes path; the old hardcoded path remains the default.
import os

data_folder = os.environ.get("CCG_DATA_FOLDER",
                             "/Volumes/backup/ccg_tweet_wikifier_data/")

1. Acquire Tweets

a. Set Twitter API


In [ ]:
# Set API keys.
# SECURITY: never hardcode credentials in a notebook -- an earlier
# revision of this cell committed live Twitter keys, which must be
# considered compromised and revoked/rotated. Credentials are now read
# from environment variables; set them before running this cell.
import os

consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')

In [ ]:
# Set up API
import tweepy

# OAuth 1a handshake using the four credential values defined above.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# wait_on_rate_limit=True makes tweepy sleep through Twitter's
# rate-limit windows instead of raising, so the long retrieval loop
# below can run unattended.
api = tweepy.API(auth, wait_on_rate_limit=True)

b. Load File

label-trainDev.tsv -- train split of NEEL [2] dataset
label-train.tsv -- train split of label-trainDev.tsv used in this paper (used for parameter tuning)
label-dev.tsv -- dev split of label-trainDev.tsv used in this paper (used for parameter tuning)
label-test.tsv -- test split of NEEL [2] dataset


In [32]:
# The four NEEL annotation splits described above (train+dev, train,
# dev, test); all are read when collecting tweet ids and gold labels.
file_list = ["label-trainDev.tsv", "label-train.tsv", "label-dev.tsv", "label-test.tsv"]

Get a set of unique tweet ids


In [ ]:
# Collect the distinct tweet ids referenced by any of the four splits;
# each TSV row carries the tweet id in its first tab-separated column.
tweet_ids = set()
for file_name in file_list:
    with open("./data/v1-NEELOnly/" + file_name, "r") as f:
        tweet_ids.update(line.split("\t")[0] for line in f)

c. Retrieve Tweets using API

Define a function to retrieve tweets


In [ ]:
def fill_tweet_set(id_list):
    """Fetch the raw JSON payload of every tweet id via the Twitter API.

    Parameters
    ----------
    id_list : iterable of str
        Tweet ids to retrieve (order does not matter).

    Returns
    -------
    (tweet_set, failed) : tuple of dict
        ``tweet_set`` maps tweet id -> raw tweet JSON dict;
        ``failed`` maps tweet id -> the exception class raised while
        fetching it (deleted/protected tweets, network errors, ...).
    """
    # Progress bar over all requested ids.
    bar = pyprind.ProgBar(len(id_list))

    tweet_set = {}
    failed = {}

    # NOTE: `tweet_id` instead of `id` -- the old name shadowed the
    # builtin id().
    for tweet_id in id_list:
        bar.update()

        try:
            tweet_set[tweet_id] = api.get_status(tweet_id)._json
        except Exception:
            # Catch Exception rather than a bare `except:` so that
            # KeyboardInterrupt/SystemExit still abort the long loop.
            # Record the exception class for later inspection.
            failed[tweet_id] = sys.exc_info()[0]

    return (tweet_set, failed)

In [ ]:
# Retrieve every tweet; `result` is the (tweet_set, failed) pair.
# This can take a long time -- the API client waits through rate limits.
result = fill_tweet_set(tweet_ids)

Save the tweet set to a file


In [ ]:
# Persist the raw (tweet_set, failed) tuple so the expensive API
# retrieval never has to be repeated.
with open(data_folder+"Tweet/NEEL_tweets(raw).pickle", "wb") as f:
    pickle.dump(result, f)

In [ ]:
# Build the working corpus keyed by tweet id, keeping only the
# successfully fetched tweets (result[0]); failures (result[1]) are
# intentionally dropped here.
tweet_corpus = {}
for tweet_id, info in result[0].items():
    tweet_corpus[tweet_id] = {'tweet_info': info}

# Checkpoint the initial corpus.
with open(data_folder + "Tweet/NEEL_tweets(initial).pickle", "wb") as f:
    pickle.dump(tweet_corpus, f)

A sample tweet object in json


In [ ]:
# Inspect one tweet's entry (rich display via the last-expression rule).
tweet_corpus['93314579924393984']

2. Load Golden Standard


In [28]:
# Reload the checkpointed corpus so this section can run without
# re-fetching anything from the API.
with open(data_folder+"Tweet/NEEL_tweets(initial).pickle", "rb") as f:
    tweet_corpus = pickle.load(f)

In [29]:
def golden_generator(tid, s_idx, e_idx, wiki_title, freebase, mention):
    """Bundle one gold-standard annotation into a flat dict record.

    All arguments are kept as the raw strings read from the TSV files
    (offsets included); no conversion is performed here.
    """
    return {
        'tid': tid,
        'start_idx': s_idx,
        'end_idx': e_idx,
        'wiki_title': wiki_title,
        'freebase_title': freebase,
        'mention': mention,
    }

In [30]:
def golden_standard(file_list, tweet_corpus):
    """Attach gold-standard annotations from the TSV splits to each tweet.

    Every tweet in `tweet_corpus` gets a (reset) 'goldens' list; each
    unique annotation line whose tweet id is present in the corpus is
    parsed into a record via golden_generator and appended there.
    """
    # De-duplicate annotation lines across all splits (trainDev overlaps
    # with train/dev by construction).
    unique_lines = set()
    for file_name in file_list:
        with open("./data/v1-NEELOnly/" + file_name, "r") as f:
            unique_lines.update(line.strip() for line in f)

    # Reset annotations so re-running the cell never duplicates records.
    for tw in tweet_corpus.values():
        tw['goldens'] = []

    for line in unique_lines:
        segments = line.strip().split("\t")
        tid = segments[0]
        if tid in tweet_corpus:
            record = golden_generator(segments[0], segments[1], segments[2],
                                      segments[3], segments[4], segments[5])
            tweet_corpus[tid]['goldens'].append(record)

Now loading ...


In [33]:
# Attach the gold-standard annotations to every tweet in the corpus.
golden_standard(file_list, tweet_corpus)

In [54]:
# Recover each gold mention's original surface form by slicing the
# UTF-8 byte string of the tweet text with the annotated offsets --
# presumably the NEEL start/end indices are byte offsets rather than
# character offsets, which is why the text is encoded before slicing
# (TODO confirm against the dataset description).
for tweet in tweet_corpus.values():
    text = tweet['tweet_info']['text']
    for g in tweet['goldens']:
        g['mention_orig'] = text.encode('utf-8')[int(g['start_idx']):int(g['end_idx'])]

This is an example of golden standard


In [55]:
# Example: the gold annotations attached to one tweet.
tweet_corpus['93314579924393984']['goldens']


Out[55]:
[{'end_idx': '10',
  'freebase_title': '/m/02_nkp',
  'mention': 'Lamar Odom',
  'mention_orig': 'Lamar Odom',
  'start_idx': '0',
  'tid': '93314579924393984',
  'wiki_title': 'Lamar_Odom'}]

3. Extracting Twitter's Special Information

a. Hashtag

Let's see an example:


In [56]:
# Show the raw hashtag entities of a sample tweet (Python 2 print).
print tweet_corpus['93141776474456064']['tweet_info']['entities']['hashtags']


[{u'indices': [14, 26], u'text': u'TheBachelor'}]

In [57]:
def make_hashtag_mapping(tweet):
    """Index a tweet's hashtag entities by their text.

    Returns a dict mapping hashtag text -> the full hashtag entity dict
    (which includes the 'indices' offsets).
    """
    hashtags = tweet['tweet_info']['entities']['hashtags']
    return {tag['text']: tag for tag in hashtags}

b. User Mention

Let's see an example:


In [58]:
# Show the raw user-mention entities of a sample tweet (Python 2 print).
print tweet_corpus['93141776474456064']['tweet_info']['entities']['user_mentions']


[{u'indices': [113, 128], u'id_str': u'17890396', u'screen_name': u'chrisbharrison', u'name': u'Chris Harrison', u'id': 17890396}]

In [59]:
def make_usermention_mapping(tweet):
    """Index a tweet's user-mention entities by screen name.

    Returns a dict mapping screen_name -> the full user-mention entity
    dict (which includes 'indices', 'id', and display 'name').
    """
    mentions = tweet['tweet_info']['entities']['user_mentions']
    return {mention['screen_name']: mention for mention in mentions}

c. Cashtag


In [60]:
# Show the raw cashtag ("symbols") entities of a sample tweet.
print tweet_corpus['100982604374880256']['tweet_info']['entities']['symbols']


[{u'indices': [105, 110], u'text': u'AAPL'}]

In [61]:
def make_cashtag_mapping(tweet):
    """Index a tweet's cashtag ("symbols") entities by their text.

    Returns a dict mapping symbol text (e.g. 'AAPL') -> the full entity
    dict with its 'indices' offsets.
    """
    symbols = tweet['tweet_info']['entities']['symbols']
    return {symbol['text']: symbol for symbol in symbols}

d. URL


In [62]:
# Show the raw URL entities of a sample tweet.
print tweet_corpus['100982604374880256']['tweet_info']['entities']['urls']


[{u'url': u'http://t.co/vOtnNjD', u'indices': [85, 104], u'expanded_url': u'http://reut.rs/pJSj5o', u'display_url': u'reut.rs/pJSj5o'}]

In [63]:
def make_url_mapping(tweet):
    """Index a tweet's URL entities by their t.co short URL.

    Returns a dict mapping the shortened 'url' -> the full entity dict
    (with 'expanded_url', 'display_url', and 'indices').
    """
    urls = tweet['tweet_info']['entities']['urls']
    return {entry['url']: entry for entry in urls}

Iterate through the tweets


In [64]:
# Build all four entity lookup tables for every tweet, in the same
# order as before (hashtags, user mentions, cashtags, URLs).
mapping_builders = [
    ('hashtag_mapping', make_hashtag_mapping),
    ('usermention_mapping', make_usermention_mapping),
    ('cashtag_mapping', make_cashtag_mapping),
    ('url_mapping', make_url_mapping),
]

for tweet in tweet_corpus.values():
    for field, build in mapping_builders:
        tweet[field] = build(tweet)

In [65]:
# Checkpoint the corpus with the Twitter entity mappings attached.
with open(data_folder+"Tweet/NEEL_tweets(with_tw).pickle", "wb") as f:
    pickle.dump(tweet_corpus, f)

loading tweets


In [66]:
# Reload the checkpoint so the parsing section can start from here.
with open(data_folder+"Tweet/NEEL_tweets(with_tw).pickle", "rb") as f:
    tweet_corpus = pickle.load(f)

4. Text Parsing

a. Stanford NLP


In [67]:
# NOTE(review): `from corenlp import *` pollutes the namespace and
# implicitly brings extra names (e.g. `json`) into scope -- prefer
# explicit imports. Kept as-is because later cells may depend on the
# leaked names.
from corenlp import *
corenlp = StanfordCoreNLP()  # starts/loads the Stanford CoreNLP models


Loading Models: 5/5                                                            

In [68]:
def stanford_parse(corenlp, text):
    """Parse `text` with the given CoreNLP wrapper and return a dict.

    Parameters
    ----------
    corenlp : object
        A wrapper exposing .parse(text) that returns a JSON string.
    text : str
        The text to parse.

    Returns
    -------
    dict
        The decoded parse (sentences / words / offsets), so downstream
        code can index into it directly.
    """
    # Explicit import: previously `json` was only available because it
    # leaked in via `from corenlp import *`.
    import json
    return json.loads(corenlp.parse(text))

In [69]:
# Parse every tweet with CoreNLP (slow -- the recorded run took ~10
# minutes) and store the decoded parse on the tweet.
bar = pyprind.ProgBar(len(tweet_corpus), width = 70)

for tweet in tweet_corpus.values():
    bar.update()
    text = tweet['tweet_info']['text']

    # Flatten newlines/tabs to single spaces before parsing. Both
    # replacements are 1-char-for-1-char, so the character offsets in
    # the parse still align with the original stored text.
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")

    # tweet['stanford_parsed_text'] = corenlp.parse(text)
    tweet['stanford_parsed'] = stanford_parse(corenlp, text)


0%                                                                  100%
[######################################################################] | ETA: 00:00:00
Total time elapsed: 00:10:38

In [12]:
# NOTE(review): 'stanford_parsed_text' is never assigned in this
# notebook (the line that would set it is commented out in the parsing
# loop), so this cell raises KeyError on a fresh Restart-and-Run-All --
# looks like a stale scratch cell that should be removed.
text = tweet_corpus['94518623552552961']['stanford_parsed_text'].replace("\n", " ")

In [21]:
# The raw text of the example tweet.
tweet_corpus['94518623552552961']['tweet_info']['text']


Out[21]:
u'Police: The Ut\xf8ya shooter is a 32 year old Norwegian national. His ethnicity has NOT been disclosed. Interrogation is currently taking place'

In [24]:
# Bigram list of the example tweet. NOTE(review): this cell's execution
# count (In [24]) predates the n-gram generation cell (In [120]), so the
# recorded output relies on hidden kernel state from an earlier run.
tweet_corpus['94518623552552961']['ngrams'][2]


Out[24]:
[u'Police:',
 u': The',
 u'The Ut\xf8ya',
 u'Ut\xf8ya shooter',
 u'shooter is',
 u'is a',
 u'a 32',
 u'32 year',
 u'year old',
 u'old Norwegian',
 u'Norwegian national',
 u'national.',
 u'His ethnicity',
 u'ethnicity has',
 u'has NOT',
 u'NOT been',
 u'been disclosed',
 u'disclosed.',
 u'Interrogation is',
 u'is currently',
 u'currently taking',
 u'taking place']

In [25]:
# Gold annotations of the example tweet.
tweet_corpus['94518623552552961']['goldens']


Out[25]:
[{'end_idx': '25',
  'freebase_title': '/m/0h1fpkg',
  'mention': 'Ut\xc3\xb8ya shooter',
  'start_idx': '12',
  'tid': '94518623552552961',
  'wiki_title': 'Anders_Behring_Breivik'}]

In [26]:
# The same tweet text again, shown next to its gold annotations.
tweet_corpus['94518623552552961']['tweet_info']['text']


Out[26]:
u'Police: The Ut\xf8ya shooter is a 32 year old Norwegian national. His ethnicity has NOT been disclosed. Interrogation is currently taking place'

5. N-gram generation


In [103]:
# helper functions

def remove_zero(inlist):
    """Remove every empty element from `inlist` in place.

    Bug fix: the previous version called list.remove() while iterating
    over the same list, which skips the element following each removal
    (e.g. one of two consecutive empty strings survived). Rebuilding the
    list contents via slice assignment removes them all and keeps the
    in-place contract (callers hold references to the same list object).
    """
    inlist[:] = [ele for ele in inlist if len(ele) != 0]

def gram_to_string(segments, sentence_text):
    """Return the surface form of a given n-gram with its offsets.

    `segments` is a slice of CoreNLP word entries ([token, attrs]
    pairs); the surface string is recovered from `sentence_text` using
    the character offsets of the first and last word, then UTF-8
    encoded.

    Returns (surface, start_offset, end_offset) with the offsets kept
    as the original strings.
    """
    begin = segments[0][1]['CharacterOffsetBegin']
    end = segments[-1][1]['CharacterOffsetEnd']
    surface = sentence_text[int(begin):int(end)].encode('UTF-8')
    return (surface, begin, end)
    
def n_grams_helper(parsed, n, sentence_text):
    """Produce every n-gram of one parsed sentence.

    Parameters
    ----------
    parsed : dict
        One CoreNLP sentence entry; only its 'words' list is used.
    n : int
        The n-gram size.
    sentence_text : str
        The text that the word character offsets refer to.

    Returns
    -------
    list
        (surface, start_offset, end_offset) tuples as produced by
        gram_to_string; empty when the sentence has fewer than n words.
    """
    words = parsed['words']
    length = len(words)

    if length < n:
        return []

    # Slide a window of n consecutive words across the sentence.
    # (Dropped the unused read of parsed['text'] from the old version.)
    return [gram_to_string(words[idx: idx + n], sentence_text)
            for idx in range(0, length - n + 1)]

In [109]:
def n_grams_generator(stanford_core, text, max_n, parsed):
    """Collect all n-grams, n = 1 .. max_n-1, from a parsed tweet.

    `stanford_core` is kept only for interface compatibility (the text
    arrives pre-parsed in `parsed`); the range over n is exclusive of
    max_n, matching the original behavior.
    """
    grams = {}

    sentences = parsed['sentences']

    for n in range(1, max_n):
        collected = []
        for sentence in sentences:
            collected.extend(n_grams_helper(sentence, n, text))
        remove_zero(collected)
        grams[n] = collected

    return grams

In [120]:
bar = pyprind.ProgBar(len(tweet_corpus), width = 70)

# Generate n-grams for n = 1..99 for every tweet, reusing the stored
# CoreNLP parse (max_n=100 is exclusive in n_grams_generator).
for tweet in tweet_corpus.values():
    bar.update()
    tweet['ngrams'] = n_grams_generator(corenlp, tweet['tweet_info']['text'],\
                                                    100, tweet['stanford_parsed'])


0%                                                                  100%
[######################################################################] | ETA: 00:00:00
Total time elapsed: 00:00:01

In [121]:
# n_grams_reprocess:
# For each n-gram containing exactly one '/', '-', '?' or '.', append
# its two halves to the same n-gram list (n-grams containing "http" are
# exempt from the '/' split to protect URLs).
#
# NOTE(review): this extends tweet['ngrams'][n] while iterating over
# it, so freshly appended halves are themselves visited and can be
# split again on a different character; remove_zero also mutates its
# list mid-iteration. The final contents depend on this exact statement
# order, so the code is left byte-identical.
#
# NOTE(review): gram_to_string returns (text, start, end) tuples, yet
# this cell treats each n-gram as a plain string (and the saved outputs
# show strings) -- the recorded notebook state appears to predate a
# code edit; confirm which representation is current.

for tweet in tweet_corpus.values():
    for n in tweet['ngrams'].keys():
        for onegram in tweet['ngrams'][n]:
            if onegram.count("/") == 1:
                if "http" in onegram:
                    continue
                segments = onegram.split("/")
                remove_zero(segments)
                tweet['ngrams'][n].extend(segments)

            if onegram.count("-") == 1:
                segments = onegram.split("-")
                remove_zero(segments)
                tweet['ngrams'][n].extend(segments)

            if onegram.count("?") == 1:
                segments = onegram.split("?")
                remove_zero(segments)
                tweet['ngrams'][n].extend(segments)
            
            if onegram.count(".") == 1:
                segments = onegram.split(".")
                remove_zero(segments)
                tweet['ngrams'][n].extend(segments)
                
            remove_zero(tweet['ngrams'][n])

In [122]:
# Checkpoint the corpus with parses and n-grams attached.
with open(data_folder+"Tweet/NEEL_tweets(with_grams).pickle", "wb") as f:
    pickle.dump(tweet_corpus, f)

In [123]:
# Inspect one fully-processed tweet entry (goldens, mappings, parse,
# n-grams).
tweet_corpus['92955019615272961']


Out[123]:
{'cashtag_mapping': {},
 'goldens': [{'end_idx': '36',
   'freebase_title': '/m/02pg7',
   'mention': 'Eph',
   'mention_orig': 'Eph',
   'start_idx': '33',
   'tid': '92955019615272961',
   'wiki_title': 'Epistle_to_the_Ephesians'}],
 'hashtag_mapping': {},
 'ngrams': {1: ['I',
   'put',
   'on',
   'the',
   'Full',
   'Armor',
   'of',
   'God',
   '-',
   'Eph',
   '6:10'],
  2: ['I put',
   'put on',
   'on the',
   'the Full',
   'Full Armor',
   'Armor of',
   'of God',
   'God -',
   '- Eph',
   'Eph 6:10',
   'God ',
   ' Eph'],
  3: ['I put on',
   'put on the',
   'on the Full',
   'the Full Armor',
   'Full Armor of',
   'Armor of God',
   'of God -',
   'God - Eph',
   '- Eph 6:10',
   'of God ',
   'God ',
   ' Eph',
   ' Eph 6:10'],
  4: ['I put on the',
   'put on the Full',
   'on the Full Armor',
   'the Full Armor of',
   'Full Armor of God',
   'Armor of God -',
   'of God - Eph',
   'God - Eph 6:10',
   'Armor of God ',
   'of God ',
   ' Eph',
   'God ',
   ' Eph 6:10'],
  5: ['I put on the Full',
   'put on the Full Armor',
   'on the Full Armor of',
   'the Full Armor of God',
   'Full Armor of God -',
   'Armor of God - Eph',
   'of God - Eph 6:10',
   'Full Armor of God ',
   'Armor of God ',
   ' Eph',
   'of God ',
   ' Eph 6:10'],
  6: ['I put on the Full Armor',
   'put on the Full Armor of',
   'on the Full Armor of God',
   'the Full Armor of God -',
   'Full Armor of God - Eph',
   'Armor of God - Eph 6:10',
   'the Full Armor of God ',
   'Full Armor of God ',
   ' Eph',
   'Armor of God ',
   ' Eph 6:10'],
  7: ['I put on the Full Armor of',
   'put on the Full Armor of God',
   'on the Full Armor of God -',
   'the Full Armor of God - Eph',
   'Full Armor of God - Eph 6:10',
   'on the Full Armor of God ',
   'the Full Armor of God ',
   ' Eph',
   'Full Armor of God ',
   ' Eph 6:10'],
  8: ['I put on the Full Armor of God',
   'put on the Full Armor of God -',
   'on the Full Armor of God - Eph',
   'the Full Armor of God - Eph 6:10',
   'put on the Full Armor of God ',
   'on the Full Armor of God ',
   ' Eph',
   'the Full Armor of God ',
   ' Eph 6:10'],
  9: ['I put on the Full Armor of God -',
   'put on the Full Armor of God - Eph',
   'on the Full Armor of God - Eph 6:10',
   'I put on the Full Armor of God ',
   'put on the Full Armor of God ',
   ' Eph',
   'on the Full Armor of God ',
   ' Eph 6:10'],
  10: ['I put on the Full Armor of God - Eph',
   'put on the Full Armor of God - Eph 6:10',
   'I put on the Full Armor of God ',
   ' Eph',
   'put on the Full Armor of God ',
   ' Eph 6:10'],
  11: ['I put on the Full Armor of God - Eph 6:10',
   'I put on the Full Armor of God ',
   ' Eph 6:10'],
  12: [],
  13: [],
  14: [],
  15: [],
  16: [],
  17: [],
  18: [],
  19: [],
  20: [],
  21: [],
  22: [],
  23: [],
  24: [],
  25: [],
  26: [],
  27: [],
  28: [],
  29: [],
  30: [],
  31: [],
  32: [],
  33: [],
  34: [],
  35: [],
  36: [],
  37: [],
  38: [],
  39: [],
  40: [],
  41: [],
  42: [],
  43: [],
  44: [],
  45: [],
  46: [],
  47: [],
  48: [],
  49: [],
  50: [],
  51: [],
  52: [],
  53: [],
  54: [],
  55: [],
  56: [],
  57: [],
  58: [],
  59: [],
  60: [],
  61: [],
  62: [],
  63: [],
  64: [],
  65: [],
  66: [],
  67: [],
  68: [],
  69: [],
  70: [],
  71: [],
  72: [],
  73: [],
  74: [],
  75: [],
  76: [],
  77: [],
  78: [],
  79: [],
  80: [],
  81: [],
  82: [],
  83: [],
  84: [],
  85: [],
  86: [],
  87: [],
  88: [],
  89: [],
  90: [],
  91: [],
  92: [],
  93: [],
  94: [],
  95: [],
  96: [],
  97: [],
  98: [],
  99: []},
 'stanford_parsed': {u'sentences': [{u'dependencies': [[u'root',
      u'ROOT',
      u'put'],
     [u'nsubj', u'put', u'I'],
     [u'det', u'Armor', u'the'],
     [u'amod', u'Armor', u'Full'],
     [u'prep_on', u'put', u'Armor'],
     [u'prep_of', u'Armor', u'God'],
     [u'dep', u'Armor', u'Eph'],
     [u'num', u'Eph', u'6:10']],
    u'parsetree': u'(ROOT (S (NP (PRP I)) (VP (VBP put) (PP (IN on) (NP (NP (NP (DT the) (JJ Full) (NN Armor)) (PP (IN of) (NP (NNP God)))) (: -) (NP (NN Eph) (CD 6:10)))))))',
    u'text': u'I put on the Full Armor of God - Eph 6:10',
    u'words': [[u'I',
      {u'CharacterOffsetBegin': u'0',
       u'CharacterOffsetEnd': u'1',
       u'Lemma': u'I',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'PRP'}],
     [u'put',
      {u'CharacterOffsetBegin': u'2',
       u'CharacterOffsetEnd': u'5',
       u'Lemma': u'put',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'VBP'}],
     [u'on',
      {u'CharacterOffsetBegin': u'6',
       u'CharacterOffsetEnd': u'8',
       u'Lemma': u'on',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'IN'}],
     [u'the',
      {u'CharacterOffsetBegin': u'9',
       u'CharacterOffsetEnd': u'12',
       u'Lemma': u'the',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'DT'}],
     [u'Full',
      {u'CharacterOffsetBegin': u'13',
       u'CharacterOffsetEnd': u'17',
       u'Lemma': u'full',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'JJ'}],
     [u'Armor',
      {u'CharacterOffsetBegin': u'18',
       u'CharacterOffsetEnd': u'23',
       u'Lemma': u'armor',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'NN'}],
     [u'of',
      {u'CharacterOffsetBegin': u'24',
       u'CharacterOffsetEnd': u'26',
       u'Lemma': u'of',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'IN'}],
     [u'God',
      {u'CharacterOffsetBegin': u'27',
       u'CharacterOffsetEnd': u'30',
       u'Lemma': u'God',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'NNP'}],
     [u'-',
      {u'CharacterOffsetBegin': u'31',
       u'CharacterOffsetEnd': u'32',
       u'Lemma': u'-',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u':'}],
     [u'Eph',
      {u'CharacterOffsetBegin': u'33',
       u'CharacterOffsetEnd': u'36',
       u'Lemma': u'eph',
       u'NamedEntityTag': u'O',
       u'PartOfSpeech': u'NN'}],
     [u'6:10',
      {u'CharacterOffsetBegin': u'37',
       u'CharacterOffsetEnd': u'41',
       u'Lemma': u'6:10',
       u'NamedEntityTag': u'TIME',
       u'NormalizedNamedEntityTag': u'T06:10',
       u'PartOfSpeech': u'CD',
       u'Timex': u'<TIMEX3 tid="t1" type="TIME" value="T06:10">6:10</TIMEX3>'}]]}]},
 'tweet_info': {u'contributors': None,
  u'coordinates': None,
  u'created_at': u'Mon Jul 18 13:52:58 +0000 2011',
  u'entities': {u'hashtags': [],
   u'symbols': [],
   u'urls': [],
   u'user_mentions': []},
  u'favorite_count': 87,
  u'favorited': False,
  u'geo': None,
  u'id': 92955019615272961,
  u'id_str': u'92955019615272961',
  u'in_reply_to_screen_name': None,
  u'in_reply_to_status_id': None,
  u'in_reply_to_status_id_str': None,
  u'in_reply_to_user_id': None,
  u'in_reply_to_user_id_str': None,
  u'is_quote_status': False,
  u'lang': u'en',
  u'place': None,
  u'retweet_count': 415,
  u'retweeted': False,
  u'source': u'<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
  u'text': u'I put on the Full Armor of God - Eph 6:10',
  u'truncated': False,
  u'user': {u'contributors_enabled': False,
   u'created_at': u'Thu Apr 08 00:37:42 +0000 2010',
   u'default_profile': False,
   u'default_profile_image': False,
   u'description': u'Christian Insights,Godly Affirmations, and Quotes from the Bible',
   u'entities': {u'description': {u'urls': []},
    u'url': {u'urls': [{u'display_url': u'facebook.com/LoveLikeJesus',
       u'expanded_url': u'http://www.facebook.com/LoveLikeJesus',
       u'indices': [0, 22],
       u'url': u'http://t.co/SyRevUGyhd'}]}},
   u'favourites_count': 18,
   u'follow_request_sent': False,
   u'followers_count': 1098068,
   u'following': False,
   u'friends_count': 94790,
   u'geo_enabled': False,
   u'has_extended_profile': False,
   u'id': 130681675,
   u'id_str': u'130681675',
   u'is_translation_enabled': False,
   u'is_translator': False,
   u'lang': u'en',
   u'listed_count': 5866,
   u'location': u'',
   u'name': u'Love Like Jesus',
   u'notifications': False,
   u'profile_background_color': u'FCF2EA',
   u'profile_background_image_url': u'http://pbs.twimg.com/profile_background_images/334035182/old-paper-jesus_background-800-3_-_Copy.jpg',
   u'profile_background_image_url_https': u'https://pbs.twimg.com/profile_background_images/334035182/old-paper-jesus_background-800-3_-_Copy.jpg',
   u'profile_background_tile': False,
   u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/130681675/1398329721',
   u'profile_image_url': u'http://pbs.twimg.com/profile_images/1553461504/images-of-jesus-christ-097-2c_normal.jpg',
   u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/1553461504/images-of-jesus-christ-097-2c_normal.jpg',
   u'profile_link_color': u'994F00',
   u'profile_sidebar_border_color': u'FFFFFF',
   u'profile_sidebar_fill_color': u'EADEAA',
   u'profile_text_color': u'333333',
   u'profile_use_background_image': False,
   u'protected': False,
   u'screen_name': u'LovLikeJesus',
   u'statuses_count': 53788,
   u'time_zone': u'Central Time (US & Canada)',
   u'url': u'http://t.co/SyRevUGyhd',
   u'utc_offset': -18000,
   u'verified': False}},
 'url_mapping': {},
 'usermention_mapping': {}}

In [87]:
# NOTE(review): leftover scratch cell -- `m` exists only in hidden
# kernel state and the recorded output is a ValueError traceback.
# These debug cells should be deleted before sharing the notebook.
m.remove("dasd")


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-87-6428fc7e38a1> in <module>()
----> 1 m.remove("dasd")

ValueError: list.remove(x): x not in list

In [86]:
# NOTE(review): scratch cell displaying the hidden-state variable `m`;
# remove along with the cell above.
m


Out[86]:
['sda', 'dsa']

In [ ]: