Tweet Scraper

This notebook uses tweepy and a previously made twitter account to scrape every tweet from given twitter accounts. Edit the handles.txt file to change the twitter accounts set to download


In [1]:
import oauth2 as oauth
import urllib.request
from pprint import pprint
import csv
import re
import os
import tweepy
from tweepy import OAuthHandler
import json

In [2]:
keys_path = './twitterkeys.txt'
keys=[]

#load the twitter api keys
with open(keys_path) as f:
    keys=json.load(f)

In [3]:
accountToUse = 'Donald Trump'
keysToUse = keys[accountToUse]
auth = OAuthHandler(keysToUse['cons_key'], keysToUse['cons_secret'])
auth.set_access_token(keysToUse['access_token'], keysToUse['access_token_secret'])

api = tweepy.API(auth)

In [4]:
#load the list of twitter handles to scrape
handles=[]

with open('./Twitter/twitterhandles.json') as f:
    handles= json.load(f)
print(handles)

root = './Twitter/tweets/'


{'Ted Cruz': 'tedcruz', 'Hillary Clinton': 'HillaryClinton', 'Barack Obama': 'BarackObama', 'Bernie Sanders': 'BernieSanders', 'President Obama': 'potus44', 'Donald Trump': 'realDonaldTrump'}

In [5]:
#accepts a twitter handle and a name for the candidate, downloads all the tweets for that candidate and stores them in 
def scrapeTweetsToFile(handle, name):
    
    print(handle)
    alltweets = []

    newtweets=api.user_timeline(screen_name = handle, count=200)
    alltweets.extend(newtweets)
    oldest=alltweets[-1].id -1

    while len(newtweets) > 0:
        print('getting tweets before %s' %(oldest))
        newtweets = api.user_timeline(screen_name=handle, count=200, max_id=oldest)    
        alltweets.extend(newtweets)    
        oldest=alltweets[-1].id - 1
        print('...%s tweets downloaded so far' % len(alltweets))

    outtweets = [[re.sub(r'[^\x00-\x7f]',r' ',tweet.text.replace('&', '&').strip("'").replace('"','').replace('\n', ' '))] for tweet in alltweets]

    with open(os.path.join(root, '%s.txt' % name.replace(' ','-')) , 'w+', encoding='utf8') as f:

        previous=''
        
        #These skip flags ensure that a continued tweet is not split in two
        skipnext=False
        skipcurrent=False
        
        #loop through all the tweets
        for t in outtweets:
            #move the skip next flag to the skip current flag
            skipcurrent=skipnext
            skipnext=False
            
            #if there is a previous tweet and the current tweet starts with '...'
            if previous!='':
                #if the current tweet has .. in the last 6 chars (Trump is messy with his ellipsies)
                if '..' in t[0][-6:]:
                    #set previous to the current tweet with the previous tweet appended to the end, removing all ellipsis-like patterns
                    previous=t[0].strip('...').replace('...',' ')+' '+previous.strip('...').replace('...',' ')
                    #set the flag to skip the next entry
                    skipnext=True
            #if there is a previous tweet that is not a retweet or a reply, and there are no links in the tweet
            if previous != '' and previous[:2] != 'RT' and not '@' in previous[:2] and 'http' not in previous and not skipcurrent: 
                #write the previous tweet to its file
                f.write(previous+'\n') 
            
            #set the previous tweet
            previous=t[0]
    pass

In [6]:
#loop through the twitter handles and scrape the tweets of each one into a file
for handle in handles:
    scrapeTweetsToFile(handles[handle], handle)


tedcruz
getting tweets before 846782074677837823
...400 tweets downloaded so far
getting tweets before 831141818624536576
...600 tweets downloaded so far
getting tweets before 819232954513813504
...800 tweets downloaded so far
getting tweets before 806560962421620735
...999 tweets downloaded so far
getting tweets before 796126100615217152
...1195 tweets downloaded so far
getting tweets before 789272928256790527
...1395 tweets downloaded so far
getting tweets before 781490806817366016
...1595 tweets downloaded so far
getting tweets before 772509287012704255
...1795 tweets downloaded so far
getting tweets before 759008258065510399
...1995 tweets downloaded so far
getting tweets before 748131629235765247
...2195 tweets downloaded so far
getting tweets before 734173054801039359
...2393 tweets downloaded so far
getting tweets before 726026976679448575
...2592 tweets downloaded so far
getting tweets before 723883307906174975
...2791 tweets downloaded so far
getting tweets before 721741682216083455
...2989 tweets downloaded so far
getting tweets before 719675652828303359
...3187 tweets downloaded so far
getting tweets before 716729038773964799
...3224 tweets downloaded so far
getting tweets before 716304071011885056
...3224 tweets downloaded so far
HillaryClinton
getting tweets before 795440369039187967
...400 tweets downloaded so far
getting tweets before 793513555035291647
...600 tweets downloaded so far
getting tweets before 790730465175011327
...800 tweets downloaded so far
getting tweets before 788909459586314239
...1000 tweets downloaded so far
getting tweets before 785894612686757887
...1200 tweets downloaded so far
getting tweets before 783657327396265984
...1400 tweets downloaded so far
getting tweets before 781890393185058815
...1600 tweets downloaded so far
getting tweets before 780081425714335746
...1800 tweets downloaded so far
getting tweets before 776782482352009215
...2000 tweets downloaded so far
getting tweets before 774021824053116927
...2200 tweets downloaded so far
getting tweets before 770291230966251519
...2400 tweets downloaded so far
getting tweets before 765288353419890687
...2600 tweets downloaded so far
getting tweets before 761602482774675456
...2800 tweets downloaded so far
getting tweets before 758805145190707199
...3000 tweets downloaded so far
getting tweets before 757973312706318335
...3200 tweets downloaded so far
getting tweets before 756314843376918527
...3239 tweets downloaded so far
getting tweets before 755950742310154240
...3239 tweets downloaded so far
BarackObama
getting tweets before 773549731901349887
...400 tweets downloaded so far
getting tweets before 747152944047362048
...600 tweets downloaded so far
getting tweets before 723638166243016703
...800 tweets downloaded so far
getting tweets before 702936121659920383
...1000 tweets downloaded so far
getting tweets before 686966611064926207
...1200 tweets downloaded so far
getting tweets before 666354177400897539
...1400 tweets downloaded so far
getting tweets before 646093630315192319
...1600 tweets downloaded so far
getting tweets before 627914442227888127
...1800 tweets downloaded so far
getting tweets before 614505702296453119
...2000 tweets downloaded so far
getting tweets before 596717288566358015
...2200 tweets downloaded so far
getting tweets before 578308711833214975
...2399 tweets downloaded so far
getting tweets before 557738590818795519
...2599 tweets downloaded so far
getting tweets before 544580050012491775
...2798 tweets downloaded so far
getting tweets before 532909143573528575
...2998 tweets downloaded so far
getting tweets before 515870800424222719
...3198 tweets downloaded so far
getting tweets before 497412858599075841
...3231 tweets downloaded so far
getting tweets before 494959325102620672
...3231 tweets downloaded so far
BernieSanders
getting tweets before 831288596778905599
...400 tweets downloaded so far
getting tweets before 814195718810238976
---------------------------------------------------------------------------
TweepError                                Traceback (most recent call last)
<ipython-input-6-e73c09dcaa48> in <module>()
      1 #loop through the twitter handles and scrape the tweets of each one into a file
      2 for handle in handles:
----> 3     scrapeTweetsToFile(handles[handle], handle)

<ipython-input-5-aa2adec5a815> in scrapeTweetsToFile(handle, name)
     11     while len(newtweets) > 0:
     12         print('getting tweets before %s' %(oldest))
---> 13         newtweets = api.user_timeline(screen_name=handle, count=200, max_id=oldest)
     14         alltweets.extend(newtweets)
     15         oldest=alltweets[-1].id - 1

C:\Users\nick\Anaconda3\lib\site-packages\tweepy\binder.py in _call(*args, **kwargs)
    243             return method
    244         else:
--> 245             return method.execute()
    246 
    247     # Set pagination mode

C:\Users\nick\Anaconda3\lib\site-packages\tweepy\binder.py in execute(self)
    227                     raise RateLimitError(error_msg, resp)
    228                 else:
--> 229                     raise TweepError(error_msg, resp, api_code=api_error_code)
    230 
    231             # Parse the response payload

TweepError: [{'code': 131, 'message': 'Internal error'}]