SEMEVAL 2016

Data Download

Download all the data for the SemEval-2016 Task 4 competition.

Data details at: http://alt.qcri.org/semeval2016/task4/index.php?id=data-and-tools

Download the file http://alt.qcri.org/semeval2016/task4/data/uploads/semeval2016-task4.traindevdevtest.v1.2.zip

mkdir -p data
cd data
wget http://alt.qcri.org/semeval2016/task4/data/uploads/semeval2016-task4.traindevdevtest.v1.2.zip
unzip semeval2016-task4.traindevdevtest.v1.2.zip
cd ..

Extract all the tweet ids from all the files:

find ./data/ -name "*.txt" | grep -v README | xargs cut -f1 | sort | uniq > tweet_ids.txt

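Install tweepy using `pip install "tweepy<4"` (the code below relies on `tweepy.TweepError` and `tweepy.RateLimitError`, which were removed in tweepy 4.0).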


In [1]:
import json
import tweepy as tpy
import pprint as pp
import time
import datetime
import traceback

In [2]:
# Load the authentication codes from the file.
# The JSON object is expected to hold "consumer_key" / "consumer_secret" and,
# once the OAuth handshake has been done, "access_token" / "access_token_secret".
authentication_file = "auth_keys.json"
# Use a context manager so the file handle is closed as soon as parsing is done.
with open(authentication_file) as auth_fp:
    authentication_codes = json.load(auth_fp)

In [3]:
def key_missing(k, d):
    """Return True when key `k` is absent from dict `d` or maps to an empty string."""
    return (k not in d) or d[k] == ""


access_keys = ["access_token", "access_token_secret"]
consumer_keys = ["consumer_key", "consumer_secret"]

# The consumer (application) credentials are mandatory: without them the OAuth
# handler cannot even be constructed. The access token pair is optional and is
# obtained interactively below when it is missing.
assert not any(key_missing(k, authentication_codes) for k in consumer_keys), \
    "Please set consumer key and consumer secret in %s" % authentication_file
auth = tpy.OAuthHandler(authentication_codes[consumer_keys[0]], authentication_codes[consumer_keys[1]])
if any(key_missing(k, authentication_codes) for k in access_keys):
    # Start oauth dance and get the verification URL:
    try:
        redirect_url = auth.get_authorization_url()
    except tpy.TweepError:
        print 'Error! Failed to get request token.'
        # There is no URL to show the user, so continuing would only fail
        # later with an unrelated NameError. Surface the real error instead.
        raise
    print "Go to the following URL: \n", redirect_url
    verifier = raw_input("Enter the verification code: ")
    try:
        auth.get_access_token(verifier)
    except tpy.TweepError:
        print 'Error! Failed to get access token.'
        # Do not go on to build an API object that has no access token.
        raise
    authentication_codes[access_keys[0]] = auth.access_token
    authentication_codes[access_keys[1]] = auth.access_token_secret
    print "Got new access token details. Saving to file: %s" % authentication_file
    # Persist the new tokens so the handshake is not needed on the next run.
    with open(authentication_file, "wb+") as auth_out_fp:
        json.dump(authentication_codes, auth_out_fp, indent=4)
else:
    print "Setting already existing authentication codes."
    auth.set_access_token(authentication_codes[access_keys[0]], authentication_codes[access_keys[1]])

print "Initializing tweepy API"
api = tpy.API(auth)


Setting already existing authentication codes.
Initializing tweepy API

In [ ]:
TWEETS_TEXT_FILE = "TWEET_TEXT.txt"
TWEETS_DATA_FILE = "TWEET_DATA.json"
TWEETS_IDS_FILE = "tweet_ids.txt"

try:
    tweets_data = json.load(open(TWEETS_DATA_FILE))
    # Save backup of old data:
    print "Saving backup data in %s" % (TWEETS_DATA_FILE+".backup")
    json.dump(tweets_data, open(TWEETS_DATA_FILE+".backup", "wb+"))
except:
    print "Either file %s doesn't exist or error reading file. Will create new file." % TWEETS_DATA_FILE
    tweets_data = {}

total, existing, new_downloaded, not_available = 0, 0, 0, 0
text = ""
with open(TWEETS_IDS_FILE) as tid_fp, open(TWEETS_TEXT_FILE, "wb+") as ttxt_fp:
    for i, tid in enumerate(tid_fp.readlines()):
        if (i % 50) == 0:
            print "Finished reading %s lines. Next TID: %s" % (i, tid)
        tid = tid[:-1]
        if tid in tweets_data:
            text = tweets_data[tid]["text"].replace("\n", " ").replace("\r", " ")
            existing += 1
            total += 1
        while tid not in tweets_data:
            try:
                print "Trying to download: %s, %s" % (i, tid)
                status = api.get_status(tid)
                print "Download success"
                tweets_data[tid] = status._json
                text = status.text.replace("\n", " ").replace("\r", " ")
                new_downloaded += 1
                total += 1
            except tpy.RateLimitError:
                rate = api.rate_limit_status()
                reset = rate['resources']['statuses']['/statuses/show/:id']['reset']
                now = datetime.datetime.today()
                future = datetime.datetime.fromtimestamp(reset)
                seconds = (future-now).seconds+1
                if seconds < 10000:
                    print "Rate limit exceeded, sleeping for %s seconds until %s\n Will resume downloading: %s" % (seconds, future, tid)
                    print "Finished downloading %s tweets. Total: %s, Existing: %s, Not Available: %s" % (new_downloaded, total, existing, not_available)
                    print "Writing intermediate data with %s tweets to file: %s" % (len(tweets_data), TWEETS_DATA_FILE+".middle")
                    with open(TWEETS_DATA_FILE+".middle", "wb+") as fp_temp:
                        json.dump(tweets_data, fp_temp)
                    print "Finished writing. Now sleeping for %s seconds." % seconds
                    time.sleep(seconds)
                    print "Resume downloading: %s at %s" % (tid, datetime.datetime.today())
            except tpy.TweepError as e:
                print "Encountered error: ", e
                if e.api_code in [34, 179, 144, 63]:
                    print "Tweet id: %s not found. Using placeholder text" % tid
                    text = "Not Available"
                    tweets_data[tid] = {"text": text}
                    not_available += 1
                    total += 1
                else:
                    print "Encountered some other error. Follow stack trace:\n", traceback.format_exc()
                    raise
            except:
                print "Encountered some other error. Follow stack trace:\n", traceback.format_exc()
                raise
        print "Writng to %s to text file %s." % (tid, TWEETS_TEXT_FILE)
        print >> ttxt_fp, "%s\t%s" % (tid, text)
        if new_downloaded % 100 == 0 and new_downloaded > 1:
            print "Finished downloading %s tweets. Total: %s, Existing: %s, Not Available: %s" % (new_downloaded, total, existing, not_available)

print "Finished downloading all tweets. New: %s, Total: %s, Existing: %s, Not Available: %s" % (new_downloaded, total, existing, not_available)
print "Writing new data with %s tweets to file: %s" % (len(tweets_data), TWEETS_DATA_FILE)
json.dump(tweets_data, open(TWEETS_DATA_FILE, "wb+"))

In [4]:
# Show the last lines of the tab-separated "<tweet id>\t<text>" file written by the download cell.
! tail TWEET_TEXT.txt


629034528141647872	Last day to see Jurassic World tomorrow and I kinda wanna see it I mean it is Ty's birthday so it makes sense to go again
629035692450906112	I get way too hype about October fuck February Halloween is my Valentine's Day
629037053343305728	Flying to Paris tomorrow morning! First on the agenda is have a croissant and some frog's legs with Zlatan on the Eiffel tower.
629037957538672641	@BestBuy - What time does the Apple Watch go on sale this Friday?
629038344349970432	Bobby Jindal Misses Cut for 1st Prime-Time Presidential Debate: Indian-American presidential hopeful Bobby Jindal could not make it t...
629048170333405184	@noshoesnation 11th KC show, 6 different states! Would love to get my No Shoes Nation flag signed for my 20th birthday! See you at MetLife!
629048875156959232	Okay going to bed now, hope i can sleep this time ! Goodnight xx i'll see what happened at Metlife tomorrow!! Have fun y'all!! X
629050880332050432	Not Available
629051270528172032	i wish you was at MetLife:( https://t.co/pmy4uNecTI
6290521651

In [ ]: