Tweet Scraper

This notebook uses tweepy and a previously made twitter account to scrape every tweet from given twitter accounts. Edit the handles.txt file to change the twitter accounts set to download

In [1]:
import oauth2 as oauth
import urllib.request
from pprint import pprint
import csv
import re
import os
import tweepy
from tweepy import OAuthHandler
import json

In [2]:
keys_path = './twitterkeys.txt'

#load the twitter api keys
with open(keys_path) as f:

In [3]:
accountToUse = 'Donald Trump'
keysToUse = keys[accountToUse]
auth = OAuthHandler(keysToUse['cons_key'], keysToUse['cons_secret'])
auth.set_access_token(keysToUse['access_token'], keysToUse['access_token_secret'])

api = tweepy.API(auth)

In [4]:
#load the list of twitter handles to scrape

with open('./Twitter/twitterhandles.json') as f:
    handles= json.load(f)

root = './Twitter/tweets/'

{'Ted Cruz': 'tedcruz', 'Hillary Clinton': 'HillaryClinton', 'Barack Obama': 'BarackObama', 'Bernie Sanders': 'BernieSanders', 'President Obama': 'potus44', 'Donald Trump': 'realDonaldTrump'}

In [5]:
#accepts a twitter handle and a name for the candidate, downloads all the tweets for that candidate and stores them in 
def scrapeTweetsToFile(handle, name):
    alltweets = []

    newtweets=api.user_timeline(screen_name = handle, count=200)
    oldest=alltweets[-1].id -1

    while len(newtweets) > 0:
        print('getting tweets before %s' %(oldest))
        newtweets = api.user_timeline(screen_name=handle, count=200, max_id=oldest)    
        oldest=alltweets[-1].id - 1
        print('...%s tweets downloaded so far' % len(alltweets))

    outtweets = [[re.sub(r'[^\x00-\x7f]',r' ',tweet.text.replace('&', '&').strip("'").replace('"','').replace('\n', ' '))] for tweet in alltweets]

    with open(os.path.join(root, '%s.txt' % name.replace(' ','-')) , 'w+', encoding='utf8') as f:

        #These skip flags ensure that a continued tweet is not split in two
        #loop through all the tweets
        for t in outtweets:
            #move the skip next flag to the skip current flag
            #if there is a previous tweet and the current tweet starts with '...'
            if previous!='':
                #if the current tweet has .. in the last 6 chars (Trump is messy with his ellipsies)
                if '..' in t[0][-6:]:
                    #set previous to the current tweet with the previous tweet appended to the end, removing all ellipsis-like patterns
                    previous=t[0].strip('...').replace('...',' ')+' '+previous.strip('...').replace('...',' ')
                    #set the flag to skip the next entry
            #if there is a previous tweet that is not a retweet or a reply, and there are no links in the tweet
            if previous != '' and previous[:2] != 'RT' and not '@' in previous[:2] and 'http' not in previous and not skipcurrent: 
                #write the previous tweet to its file
            #set the previous tweet

In [6]:
#loop through the twitter handles and scrape the tweets of each one into a file
for handle in handles:
    scrapeTweetsToFile(handles[handle], handle)

getting tweets before 831288596778905599
...400 tweets downloaded so far
getting tweets before 814195718810238976
TweepError                                Traceback (most recent call last)
<ipython-input-6-e73c09dcaa48> in <module>()
      1 #loop through the twitter handles and scrape the tweets of each one into a file
      2 for handle in handles:
----> 3     scrapeTweetsToFile(handles[handle], handle)

<ipython-input-5-aa2adec5a815> in scrapeTweetsToFile(handle, name)
     11     while len(newtweets) > 0:
     12         print('getting tweets before %s' %(oldest))
---> 13         newtweets = api.user_timeline(screen_name=handle, count=200, max_id=oldest)
     14         alltweets.extend(newtweets)
     15         oldest=alltweets[-1].id - 1

C:\Users\nick\Anaconda3\lib\site-packages\tweepy\ in _call(*args, **kwargs)
    243             return method
    244         else:
--> 245             return method.execute()
    247     # Set pagination mode

C:\Users\nick\Anaconda3\lib\site-packages\tweepy\ in execute(self)
    227                     raise RateLimitError(error_msg, resp)
    228                 else:
--> 229                     raise TweepError(error_msg, resp, api_code=api_error_code)
    231             # Parse the response payload

TweepError: [{'code': 131, 'message': 'Internal error'}]