In [2]:
import tweepy
import re
import json

import sqlite3 as lite

import pandas as pd

import datetime, time, os, sys
import argparse, configparser
Config = configparser.ConfigParser()
Config.read('config.cnf')

consumer_key = Config.get('twittersfupubresearch', 'consumer_key')
consumer_secret = Config.get('twittersfupubresearch', 'consumer_secret')
access_token = Config.get('twittersfupubresearch', 'access_token')
access_token_secret = Config.get('twittersfupubresearch', 'access_token_secret')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# set up access to the Twitter API
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [24]:
litecon = lite.connect('data/swcc.db')

with litecon:

# set up SQL tables
    litecur = litecon.cursor()
    # the users that were found
    litecur.execute("CREATE TABLE IF NOT EXISTS users (user_id TEXT, screen_name TEXT, user_object TEXT, fetch_followers TEXT, error TEXT, user_modified TEXT)")
    litecur.execute("CREATE UNIQUE INDEX IF NOT EXISTS users_user_id ON users (user_id)")
    litecur.execute("CREATE INDEX IF NOT EXISTS users_screen_name ON users (screen_name)")    

    litecur.execute("CREATE TABLE IF NOT EXISTS followers (user_id TEXT, follower_id TEXT, modified TEXT)")
    litecur.execute("CREATE INDEX IF NOT EXISTS followers_user_id ON followers (user_id)")
    litecur.execute("CREATE UNIQUE INDEX IF NOT EXISTS followers_user_follower_id ON followers (user_id, follower_id)")

In [25]:
def get_missing_users():
    df = pd.read_excel('data/swcc-twitter.xlsx')
    seed_data = set(df['ID-Author'].str.lower())
    del df

    with litecon:
    # set up SQL tables
        litecur = litecon.cursor()
        cur = litecur.execute("SELECT screen_name FROM users")
        fetched = cur.fetchall()
        fetched = [x[0].lower() for x in fetched]

    ret = seed_data.difference(fetched)
    print("%s remaining" % len(ret))
    return ret

In [ ]:
def __insert_user(user, fetch_followers = 0):
    now = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
    with litecon:
        litecur = litecon.cursor()
        try:
            litecur.execute('INSERT INTO users (user_id, screen_name, user_object, fetch_followers, user_modified) VALUES (?, ?, ?, ?, ?)', (user.id, user.screen_name, json.dumps(user._json), fetch_followers, now))
        except lite.IntegrityError:
            # don't worry about duplicates
            pass
        
def __insert_user_error(error, user_id = None, screen_name = None):
    now = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
    with litecon:
        litecur = litecon.cursor()
        try:
            litecur.execute('INSERT INTO users (user_id, screen_name, error, user_modified) VALUES (?, ?, ?, ?)', (user_id, screen_name, error, now))
        except lite.IntegrityError:
            # don't worry about duplicates
            pass

def get_user(user_id):
    try:
        user = api.get_user(user_id)
        __insert_user(user, fetch_followers=1)
    except tweepy.TweepError as error:
        __insert_user_error(error.response.reason, user_id=None, screen_name=screen_name)

for screen_name in get_missing_users():
    get_user(screen_name)

In [31]:
def __save_followers(user_id, ids):
    '''
    Do the actual SQLite update with the info collected
    '''
    now = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
    with litecon:
        litecur = litecon.cursor()
        
        print( 'saving', len(ids))
        for f in ids:
            try: 
                litecur.execute('INSERT INTO followers (user_id, follower_id, modified) VALUES (?, ?, ?)', (user_id, f, now))
            except lite.IntegrityError:
                pass # ignore duplicates, they wont change the network
            
def __save_follower_error(user_id):
    now = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
    with litecon:
        litecur = litecon.cursor() 
    litecur.execute('INSERT INTO followers (user_id, follower_id, modified) VALUES (?, ?, ?)', (user_id, -1, now))
    
    
def get_followers(user_id = None):
    '''
    Get the followers list for all users (or for a specific user)
    '''
    if user_id is None:       
        while (True):
            with litecon:
                litecur = litecon.cursor()
                litecur.execute('SELECT u.user_id FROM users u LEFT JOIN followers f ON (u.user_id = f.user_id) WHERE f.user_id IS NULL and fetch_followers = 1')
        
            # go 100 at a time so we're not hitting the DB so much
            users = [u[0] for u in litecur.fetchmany(100)]
            if not users: break

            for user_id in users:
                ids = get_followers(user_id)
    else: 
        try:
            ids = []
            for page in tweepy.Cursor(api.followers_ids, id=user_id).pages():
                ids.extend(page)

            if len(ids) > 0: 
                __save_followers(user_id, ids)
            else:
                __save_follower_error(user_id)

        except tweepy.TweepError as error: 
            pass

get_followers()


saving 1242
saving 490
saving 1035
saving 4123
saving 121
saving 11166
saving 304
saving 608
saving 422
saving 1668
saving 2
saving 5910
Rate limit reached. Sleeping for: 897
saving 1337
saving 1471
saving 739
saving 4297
saving 1481
saving 139
saving 583
saving 800
saving 117
saving 95
saving 716
saving 1423
saving 3926
saving 400
Rate limit reached. Sleeping for: 897
saving 1744
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-31-3744e124703b> in <module>()
     51             pass
     52 
---> 53 get_followers()

<ipython-input-31-3744e124703b> in get_followers(user_id)
     36 
     37             for user_id in users:
---> 38                 ids = get_followers(user_id)
     39     else:
     40         try:

<ipython-input-31-3744e124703b> in get_followers(user_id)
     40         try:
     41             ids = []
---> 42             for page in tweepy.Cursor(api.followers_ids, id=user_id).pages():
     43                 ids.extend(page)
     44 

/usr/local/lib/python3.6/site-packages/tweepy/cursor.py in __next__(self)
     47 
     48     def __next__(self):
---> 49         return self.next()
     50 
     51     def next(self):

/usr/local/lib/python3.6/site-packages/tweepy/cursor.py in next(self)
     73         data, cursors = self.method(cursor=self.next_cursor,
     74                                     *self.args,
---> 75                                     **self.kargs)
     76         self.prev_cursor, self.next_cursor = cursors
     77         if len(data) == 0:

/usr/local/lib/python3.6/site-packages/tweepy/binder.py in _call(*args, **kwargs)
    248             return method
    249         else:
--> 250             return method.execute()
    251 
    252     # Set pagination mode

/usr/local/lib/python3.6/site-packages/tweepy/binder.py in execute(self)
    162                                     if self.wait_on_rate_limit_notify:
    163                                         log.warning("Rate limit reached. Sleeping for: %d" % sleep_time)
--> 164                                     time.sleep(sleep_time + 5)  # sleep for few extra sec
    165 
    166                 # if self.wait_on_rate_limit and self._reset_time is not None and \

KeyboardInterrupt: 

In [ ]:
def get_users_of_followers():
    while (True):
        with litecon:
            litecur = litecon.cursor()
            litecur.execute('SELECT f.follower_id FROM followers f LEFT JOIN users u ON (f.follower_id = u.user_id) WHERE u.user_id IS NULL')

        # go 100 at a time so we're not hitting the DB so much
        users = [u[0] for u in litecur.fetchmany(100)]
        if not users: break

        for user_id in users:
            get_user(user_id)