In [11]:
from twython import Twython
import pandas as pd
from loader import load_paths
import time
from shutil import rmtree, move
from urllib.request import urlopen
from zipfile import ZipFile
import os
import pickle
from loader import load_michigan_tweets
def load_tweet_ids():
""" Reads the tweet IDs and strips away the newline characters. Returns a list of tweet IDs. """
with open("data/michigan/michigan_tweet_ids.txt", 'r') as f:
tweet_ids = f.readlines()
tweet_ids = [tweet_id.strip() for tweet_id in tweet_ids]
return tweet_ids
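# A quick sanity check on the parsed IDs might look like the sketch below. It assumes the
# file holds one decimal status ID per line; the file itself is not shown here.
#
#     ids = load_tweet_ids()
#     assert all(tweet_id.isdigit() for tweet_id in ids)
#     print("Loaded {} tweet IDs.".format(len(ids)))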
def access_twitter_api():
""" Gets API keys from a hardcoded path and returns an initialized Twython twitter API object. """
with open("../keys/twitter.txt", 'r') as f:
CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN = f.readlines()
twitter_api = Twython(CONSUMER_KEY, access_token = ACCESS_TOKEN)
return twitter_api
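# The layout of ../keys/twitter.txt is an assumption inferred from the unpacking above:
# three lines holding, in order, the consumer key, the consumer secret, and an app-only
# (OAuth 2) access token, e.g. (placeholders, not real credentials):
#
#     YOUR_CONSUMER_KEY
#     YOUR_CONSUMER_SECRET
#     YOUR_APP_ONLY_ACCESS_TOKEN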
def get_tweets_from_tweet_ids(twitter_api, tweet_ids):
""" Ask Twitter to return all tweest in our list of tweet_ids. We can ask for 30,000 tweets every 15 minutes. """
# applications are limited to retrieving 30,000 tweets every 15 minutes (not including a couple of minutes as buffer)
# (100 tweets per query, 300 queries every 15 minutes according to https://developer.twitter.com/en/docs/basics/rate-limits)
# The Michigan dataset has 142,249 tweets. It takes ~90 minutes to run the 1,423 queries necessary to retreive all of them.
# Howeer, only 63,245 of the tweets are still available. The deleted tweets were probably disproportionately junk news.
print("Attempting to download {} tweets.".format(len(tweet_ids)))
TWEET_IDS_PER_QUERY = 100
QUERIES_PER_WINDOW = 300
LENGTH_OF_WINDOW = 15 # in minutes
# chunk tweet ids into groups according to how many tweet ids can fit in each query
tweet_id_chunks = []
for i in range(0, len(tweet_ids), TWEET_IDS_PER_QUERY):
# syntax handles the special case of the last chunk, which may have fewer than 100 items, gracefully.
tweet_id_chunks.append(tweet_ids[i:i + TWEET_IDS_PER_QUERY])
# chunk those chunks into groups according to how many queries you can run during each time window
chunks_of_tweet_id_chunks = []
for i in range(0, len(tweet_id_chunks), QUERIES_PER_WINDOW):
# syntax handles the special case of the last chunk, which may have fewer than 300 items, gracefully.
chunks_of_tweet_id_chunks.append(tweet_id_chunks[i:i + QUERIES_PER_WINDOW])
all_tweets = []
# retrieve the max number of tweets you can, wait for the next time window, and repeat until done
# in the future, we could write the tweets to our database while we wait
for i, chunk_of_tweet_id_chunks in enumerate(chunks_of_tweet_id_chunks):
# wait 15 minutes + some buffer (sleep takes seconds) between chunks of calls
if i != 0:
time.sleep(17 * 60)
for tweet_id_chunk in chunk_of_tweet_id_chunks:
tweets = twitter_api.lookup_status(id = tweet_id_chunk) # each tweet is a dictionary. tweets is a list of dictionaries.
all_tweets += tweets
print("Downloaded {} tweets.".format(len(all_tweets)))
print("Finished downloading chunk {} of {} (chunk id {}). {} tweets downloaded so far.".format(i + 1, len(chunks_of_tweet_id_chunks), i, len(all_tweets)))
print("Finished downloading {} tweets!".format(len(all_tweets)))
return all_tweets
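# Back-of-the-envelope check of the ~90 minute estimate quoted above. This helper is a
# rough sketch, not part of the original pipeline; the 17 minutes per window is the
# 15-minute rate-limit window plus the 2-minute buffer used in the loop.
def estimate_download_time(n_tweets=142249, ids_per_query=100, queries_per_window=300, minutes_per_window=17):
    from math import ceil
    n_queries = ceil(n_tweets / ids_per_query)        # 1,423 queries at 100 IDs each
    n_windows = ceil(n_queries / queries_per_window)  # 5 rate-limit windows at 300 queries each
    minutes_sleeping = (n_windows - 1) * minutes_per_window  # one full sleep between consecutive windows
    return n_queries, n_windows, minutes_sleeping
# e.g. estimate_download_time() returns (1423, 5, 68): ~68 minutes of sleeping, plus the time the queries themselves take.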
def sort_tweet_ids(tweet_ids, downloaded_tweets):
    """ Splits tweet_ids into the IDs that appear in downloaded_tweets and the IDs that still need to be fetched. """
    downloaded_tweet_ids = [tweet["id_str"] for tweet in downloaded_tweets]
    assert(len(downloaded_tweet_ids) == len(downloaded_tweets))
    # testing membership against a set keeps this linear instead of quadratic in the number of tweets
    downloaded_tweet_id_set = set(downloaded_tweet_ids)
    undownloaded_tweet_ids = [tweet_id for tweet_id in tweet_ids if tweet_id not in downloaded_tweet_id_set]
    assert(len(downloaded_tweet_ids) + len(undownloaded_tweet_ids) == len(tweet_ids))
    return downloaded_tweet_ids, undownloaded_tweet_ids
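# A toy illustration of the partition this returns (hypothetical IDs, not real tweets):
#
#     done, todo = sort_tweet_ids(["1", "2", "3"], [{"id_str": "2"}])
#     # done == ["2"], todo == ["1", "3"]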
# downloaded_tweet_ids, undownloaded_tweet_ids = sort_tweet_ids(tweet_ids, downloaded_tweets)
In [12]:
def load_tweets():
print("Loading data...")
with open("data.pkl", "rb") as f:
downloaded_tweets = pickle.load(f)
print("len(downloaded_tweets): {}".format(len(downloaded_tweets)))
return downloaded_tweets
def fn(tweet_ids):
    """ Loads the tweets downloaded so far, fetches whichever tweets are still missing, and re-pickles the combined list. """
    downloaded_tweets = load_tweets()
    downloaded_tweet_ids, undownloaded_tweet_ids = sort_tweet_ids(tweet_ids, downloaded_tweets)
    print("len(downloaded_tweets): {}".format(len(downloaded_tweets)))
    twitter_api = access_twitter_api()
    new_downloaded_tweets = get_tweets_from_tweet_ids(twitter_api, undownloaded_tweet_ids)
    print("len(new_downloaded_tweets): {}".format(len(new_downloaded_tweets)))
    downloaded_tweets += new_downloaded_tweets
    downloaded_tweet_ids, undownloaded_tweet_ids = sort_tweet_ids(tweet_ids, downloaded_tweets)
    print("len(downloaded_tweet_ids): {}".format(len(downloaded_tweet_ids)))
    print("len(undownloaded_tweet_ids): {}".format(len(undownloaded_tweet_ids)))
    print("Saving data...")
    with open("data.pkl", "wb") as f:
        pickle.dump(downloaded_tweets, f, protocol=2)
    print("Saved data.")
    return
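# Because fn() requests only the IDs that are still missing and re-pickles the combined
# list, previously saved tweets are never re-downloaded, so it can simply be re-run after
# a crash or after waiting out a rate-limit window. A hypothetical driver loop (not used
# here) might look like:
#
#     previous = 0
#     while True:
#         fn(tweet_ids)
#         current = len(load_tweets())
#         if current == previous:  # no new tweets arrived; the remainder were likely deleted
#             break
#         previous = current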
tweet_ids = load_tweet_ids()
In [34]:
fn(tweet_ids)
In [15]:
downloaded_tweets = load_tweets()
print(len(downloaded_tweets))