Tweet Scraper
This notebook uses tweepy and the API keys of a previously created Twitter account to scrape every tweet (as far back as the Twitter API returns them) from a given list of Twitter accounts. Edit the `./Twitter/twitterhandles.json` file to change which Twitter accounts are downloaded.
In [1]:
import csv
import html
import json
import os
import re
import urllib.request
from pprint import pprint

import oauth2 as oauth
import tweepy
from tweepy import OAuthHandler
In [2]:
# Location of the JSON file that stores the Twitter API credentials.
keys_path = './twitterkeys.txt'

# Placeholder value; replaced below by the parsed JSON document, which is
# later indexed by account name and then by credential field.
keys = []
with open(keys_path) as keys_file:
    keys = json.load(keys_file)
In [3]:
# Entry of `keys` whose credentials authenticate every API request.
accountToUse = 'Donald Trump'
keysToUse = keys[accountToUse]

# Consumer key/secret go to the OAuth handler first ...
auth = OAuthHandler(
    keysToUse['cons_key'],
    keysToUse['cons_secret'],
)
# ... then the access token pair is attached to the same handler.
auth.set_access_token(
    keysToUse['access_token'],
    keysToUse['access_token_secret'],
)

# Shared tweepy client used by the scraping function below.
api = tweepy.API(auth)
In [4]:
# Read the accounts to scrape. The driver loop at the bottom of the notebook
# treats each key as the candidate's name and each value as the Twitter handle.
handles = []
with open('./Twitter/twitterhandles.json') as handles_file:
    handles = json.load(handles_file)
print(handles)

# Directory that receives one text file per scraped account.
root = './Twitter/tweets/'
In [5]:
def _fetch_all_tweets(handle):
    """Page backwards through a user's timeline and return every status received.

    Uses the module-level ``api`` client: the first call asks for 200 statuses
    of ``handle``; each following call passes ``max_id`` set to one below the
    id of the oldest status received so far. Paging stops at the first empty
    page, so an account without tweets yields an empty list instead of raising.

    Returns:
        list of status objects in the order they were received.
    """
    alltweets = []
    newtweets = api.user_timeline(screen_name=handle, count=200)
    while len(newtweets) > 0:
        alltweets.extend(newtweets)
        print('...%s tweets downloaded so far' % len(alltweets))
        # Subtract 1 so the next page does not start with the status we
        # already have (presumably max_id is inclusive -- matches the
        # original paging logic).
        oldest = alltweets[-1].id - 1
        print('getting tweets before %s' % (oldest))
        newtweets = api.user_timeline(screen_name=handle, count=200, max_id=oldest)
    return alltweets


def _clean_tweet_text(text):
    """Normalise one raw tweet text to a single line of plain ASCII.

    Decodes HTML entities (``&amp;`` becomes ``&``), strips leading/trailing
    apostrophes, removes double quotes, turns newlines into spaces and finally
    replaces every non-ASCII character with a space.
    """
    text = html.unescape(text).strip("'").replace('"', '').replace('\n', ' ')
    return re.sub(r'[^\x00-\x7f]', r' ', text)


def _is_writable(text):
    """Return True for non-empty text that is not a retweet, reply or link tweet."""
    return (
        text != ''
        and text[:2] != 'RT'        # retweets
        and '@' not in text[:2]     # replies ('@user ...' or '.@user ...')
        and 'http' not in text      # tweets containing a link
    )


def _merge_and_filter(texts):
    """Join continued tweets and drop retweets, replies and link tweets.

    ``texts`` is iterated in the order given (the order the API returned the
    statuses). A tweet with ``..`` somewhere in its last six characters is
    treated as the earlier part of the tweet that precedes it in ``texts``:
    the two are joined (earlier part first) with the ellipses removed. Chains
    of any length are joined into a single line.

    Returns:
        list of output lines (without trailing newline), including the final
        pending tweet.
    """
    lines = []
    pending = ''  # tweet (or joined chain) waiting to be written
    for current in texts:
        if pending != '' and '..' in current[-6:]:
            # `current` continues into `pending`: prepend it and keep waiting,
            # because an even earlier part may still follow.
            # strip('.') removes every leading/trailing dot.
            pending = (current.strip('.').replace('...', ' ')
                       + ' '
                       + pending.strip('.').replace('...', ' '))
            continue
        if _is_writable(pending):
            lines.append(pending)
        pending = current
    # Flush the last pending tweet; without this the final entry is lost.
    if _is_writable(pending):
        lines.append(pending)
    return lines


def scrapeTweetsToFile(handle, name):
    """Download all tweets of ``handle`` and store them in ``<root>/<name>.txt``.

    The statuses are fetched through the module-level ``api`` client, cleaned
    to single-line ASCII text, continued ("...") tweets are joined, and
    retweets, replies and tweets containing links are left out. One tweet is
    written per line. The file is created (empty) even when the account has
    no tweets.

    Args:
        handle: Twitter screen name to download.
        name: candidate name used for the output file name; spaces are
            replaced with '-'.
    """
    print(handle)
    alltweets = _fetch_all_tweets(handle)
    outtweets = [_clean_tweet_text(tweet.text) for tweet in alltweets]
    outpath = os.path.join(root, '%s.txt' % name.replace(' ', '-'))
    with open(outpath, 'w', encoding='utf8') as f:
        for line in _merge_and_filter(outtweets):
            f.write(line + '\n')
In [6]:
# Scrape every configured account: each key of `handles` names the output
# file and each value is the Twitter handle to download.
for candidate_name, twitter_handle in handles.items():
    scrapeTweetsToFile(twitter_handle, candidate_name)