notebook.community

Edit and run



In [29]:

    
%matplotlib inline
import twitter
import json
import pandas as pd
import numpy as np
import time
#import matplotlib.pyplot as plt

# Import unquote to prevent url encoding errors in next_results
# The urllib module is split into urllib.parse, urllib.request, 
# and urllib.error in Python 3
# If running this in Python 2, change urllib.parse to urllib
from urllib import unquote



In [30]:

    
# OAuth login function for instantiating Twitter API object
def oauth_login():
    # XXX: Go to http://twitter.com/apps/new to create an app and get values
    # for these credentials that you'll need to provide in place of these
    # empty string values that are defined as placeholders.
    # See https://dev.twitter.com/docs/auth/oauth for more information 
    # on Twitter's OAuth implementation.
    

    
    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                               CONSUMER_KEY, CONSUMER_SECRET)
    
    twitter_api = twitter.Twitter(auth=auth)
    return twitter_api



In [31]:

    
# Find the up to 600 most recent tweets containing a given hashtag
# The returned object contains a massive amount of data
def find_tweets(hashtag):
    # Ensure that the hashtag argument is a string
    assert type(hashtag) == str, 'Argument hashtag must be a string'
    # Ensure that the hashtag argument is, indeed, a hashtag 
    # in quotes, i.e. a string beginning with a pound sign
    assert hashtag[0] == '#', 'hashtag string must begin with a #'
    # Instantiate the Twitter API object 
    twitter_api = oauth_login()
    # Begin searching the Twitter API for tweets containing hashtag
    # Twitter only lets you find 100 tweets at a time 
    # (by default, the 100 most recent)
    search_results = twitter_api.search.tweets(q=hashtag, count=100, result_type='recent')
    
    # Extract the information on the (up to) 100 most recent tweets
    # as a list
    statuses = search_results['statuses']
    
    # Iterate through 5 more batches of results by following the cursor 
    # back in time
    for _ in range(5):
        #print("Length of statuses", len(statuses))
        try:
            next_results = search_results['search_metadata']['next_results']
        # The as statement is required in Python 3  
        # A comma would be required instead for Python 2.5 and earlier  
        # Python 2.6 and 2.7 support both the comma and the as statement
        except KeyError as e: # No more results when next_results doesn't exist
            break
        
        # Create a dictionary from next_results, which has the following form:
        # ?max_id=313519052523986943&q=NCAA&include_entities=1
        kwargs = dict([kv.split('=') for kv in unquote(next_results[1:]).split("&") ])    
        
        # Search for the 100 next most recent tweets
        search_results = twitter_api.search.tweets(**kwargs)
        # Append the results of the last search to the statuses list
        statuses += search_results['statuses']
        
    return statuses



In [6]:

    
# Resample/re-bin the time stamps to get a count of the tweets
# by time interval, which will vary by the the difference between
# the first and last time stamps
def resample_time_stamps(time_df):
    # Ensure that time_df is a DataFrame
    # The assert statement looks like this because 
    # we imported pandas as pd
    assert type(time_df) == pd.core.frame.DataFrame, 'Argument must be a pandas DataFrame'
    
    # Determine the range of times
    time_index  = time_df.index
    diff_year   = abs(time_index[0].year   - time_index[-1].year)
    diff_month  = abs(time_index[0].month  - time_index[-1].month)
    diff_day    = abs(time_index[0].day    - time_index[-1].day)
    diff_hour   = abs(time_index[0].hour   - time_index[-1].hour)
    diff_minute = abs(time_index[0].minute - time_index[-1].minute)
    diff_second = abs(time_index[0].second - time_index[-1].second)
    # Use the range of times to resample the data, 
    # and thus determine the plot format
    # Perhaps adjust this later
    if diff_year > 0:
        # Resample/bucket by month
        # The fillna(0) command fills any bins with NaN in them with 0s
        time_df = time_df.resample('M', how='sum').fillna(0)
        bin_width = 'month'
    elif diff_month > 0:
        if diff_month > 4:
            # Resample by week
            time_df = time_df.resample('W', how='sum').fillna(0)
            bin_width = 'week'
        else: 
            # Resample by day
            time_df = time_df.resample('D', how='sum').fillna(0)
            bin_width = 'day'
    elif diff_day > 0:
        if diff_day <= 10:
            # Resample by hour
            time_df = time_df.resample('H', how='sum').fillna(0)
            bin_width = 'hour'
        elif diff_day <= 21:
            # Resample by 6 hours
            time_df = time_df.resample('6H', how='sum').fillna(0)
            bin_width = '6 hours'
        else: 
            # Resample by day
            time_df = time_df.resample('D', how='sum').fillna(0)
            bin_width = 'day'
    elif diff_hour > 0:
        if diff_hour <= 3:
            # Resample by minute
            time_df = time_df.resample('T', how='sum').fillna(0)
            bin_width = 'minute'
        elif diff_hour <= 12:
            # Resample by 5 minutes
            time_df = time_df.resample('5T', how='sum').fillna(0)
            bin_width = '5 minutes'
        else: 
            # Resample by 10 minutes
            time_df = time_df.resample('10T', how='sum').fillna(0)
            bin_width = '10 minutes'
    elif diff_minute > 0:
        if diff_minute <= 3:
            # Resample by second
            time_df = time_df.resample('S', how='sum').fillna(0)
            bin_width = 'second'
        elif diff_minute <= 12:
            # Resample by 5 seconds
            time_df = time_df.resample('5S', how='sum').fillna(0)
            bin_width = '5 seconds'
        else: 
            # Resample by 10 seconds
            time_df = time_df.resample('10S', how='sum').fillna(0)
            bin_width = '10 seconds'
    elif abs(time_index[0].second - time_index[-1].second) > 0:
        # Resample/bucket (reorder) by second
        time_df = time_df.resample('S', how='sum').fillna(0)
        bin_width = 'second'
    else: 
        # Resample/bucket (reorder) by second
        time_df = time_df.resample('S', how='sum').fillna(0)
        bin_width = 'second'
    
    return time_df, bin_width



In [32]:

    
# Plot the time distribution of the (up to) 600 
# most recent tweets containing the desired hashtag
# Also returns a DataFrame containing the times 
# and locations (if available) of the tweets
def map_tweets(hashtag):
    # Ensure that the hashtag argument is a string
    assert type(hashtag) == str, 'Argument hashtag must be a string'
    # Ensure that the hashtag argument is, indeed, a hashtag 
    # in quotes, i.e. a string beginning with a pound sign
    assert hashtag[0] == '#', 'hashtag string must begin with a #'
    
    # Get all the data on the most recent tweets
    statuses = find_tweets(hashtag)
    
    # Initialize and populate a NumPy array with 
    # the time stamps of the tweets contained in 
    # statuses, in reverse chronological order
    time_stamps = np.array([])
    for i in range(len(statuses)):
        time_stamps = np.append(time_stamps, statuses[i]['created_at'])
    
    # Initialize and populate a NumPy array with 
    # the coordinates of the tweets contained in 
    # statuses, if available
    coordinates = np.array([])
    for i in range(len(statuses)):
        coordinates = np.append(coordinates, statuses[i]['coordinates'])
    
    # Create an array of ones for every tweet
    ones_array = np.ones(len(time_stamps))
    # Create a Pandas DatetimeIndex of the time stamps
    time_index = pd.DatetimeIndex(time_stamps)
    # Create a Pandas DataFrame showing one tweet 
    # for each time stamp
    time_df = pd.DataFrame(ones_array, index=time_index, columns=['Tweets'])
    # Create a DataFrame associating a location with 
    # each time stamp, if available
    coord_df = pd.DataFrame(coordinates, index=time_index, columns=['Coordinates'])
    
    # Resample the DataFrame time_df
    time_df, bin_width = resample_time_stamps(time_df)
    
    # Create the plot
    time_plot = time_df.plot(legend=False)
    time_plot.set_xlabel('Date/Time')
    time_plot.set_ylabel('Tweets per '+bin_width)
    time_plot.set_title('Frequency of tweets containing hashtag '+hashtag)
    
    return time_plot, coord_df



In [33]:

    
# Find as many tweets as possible containing a given hashtag 
# known to originate within search_radius of coords
# DO NOT CALL THIS UNLESS A TWITTER API OBJECT HAS BEEN INSTANTIATED!
def find_all_tweets(hashtag, coords, search_radius, api_calls):
    # First, ensure that the arguments are all of the 
    # proper type
    assert type(hashtag)       == str, 'Argument hashtag must be a string'
    assert type(coords)        == str, 'Argument coords must be a string'
    assert type(search_radius) == str, 'Argument search_radius must be a string'
    assert type(api_calls)     == int, 'Argument api_calls must be an integer'
    # Ensure that the hashtag argument is, indeed, a hashtag 
    # in quotes, i.e. a string beginning with a pound sign
    #assert hashtag[0] == '#', 'hashtag string must begin with a #'
    # Ensure that the search radius is expressed in either mi or km
    assert (search_radius[-2:] == 'mi') or (search_radius[-2:] == 'km'), \
      'search_radius must end in mi or km'
    
    # Twitter limits us to 180 calls to its API within a 15-minute span 
    # Setting this waiting period (in seconds) will allow us to pause 
    # the evaluation of this loop as necessary without having to restart 
    # it manually
    waiting_period = 15*60 
    # I know fully well that this works out to 900 seconds 
    # I merely thought that expressing the waiting period in min*60 sec/min 
    # would make it more obvious how long the waiting period is
    
    # Check to make sure we're not about to hit the API rate limit
    # If we are, wait 15 minutes before continuing 
    # and reset the API call counter
    if api_calls == 180:
        time.sleep(waiting_period)
        api_calls = 0
    
    # Find the 100 most recent tweets containing hashtag 
    # which originated within search_radius of coords, 
    # then extract the relevant information
    geocode_str = coords + ',' + search_radius
    search_results = \
      twitter_api.search.tweets(q=hashtag,count=100,result_type='recent',
                                geocode=geocode_str)
    statuses = search_results['statuses']
    # Update the counter
    api_calls += 1
    
    # Iterate through as many batches of results as possible 
    # by following the cursor back in time
    # First, initialize the Boolean variable indicating whether 
    # more search results exist
    more_results = True
    while more_results:
        #print("Length of statuses", len(statuses))
        try:
            next_results = search_results['search_metadata']['next_results']
        # The as statement is required in Python 3  
        # A comma would be required instead for Python 2.5 and earlier  
        # Python 2.6 and 2.7 support both the comma and the as statement
        except KeyError as e: # No more results when next_results doesn't exist
            more_results = False
            break
        
        # Create a dictionary from next_results, which has the following form:
        # ?max_id=313519052523986943&q=NCAA&include_entities=1
        kwargs = dict([kv.split('=') for kv in unquote(next_results[1:]).split("&") ]) 
        
        # Check to make sure we're not about to hit the API rate limit
        # If we are, wait 15 minutes before continuing
        # and reset the API call counter
        if api_calls == 180:
            time.sleep(waiting_period)
            api_calls = 0
        # Search for the 100 next most recent tweets
        search_results = twitter_api.search.tweets(**kwargs)
        # Append the results of the last search to the statuses list
        statuses += search_results['statuses']
        # Update the counter
        api_calls += 1
    
    return statuses, api_calls



In [42]:

    
# Extract the time stamp, coordinates, and user location 
# from the tweets, if available
def parse_tweets_for_coordinate(statuses, coord,city):
    # Initialize a list of the time stamps of the tweets 
    # contained in statuses, in reverse chronological order
    
    # Initialize a list of user locations
    user_locations = [] 
    locationInfo = [];
    for i in range(len(statuses)): 
        element = {};
        element['type'] = 'Feature';
        element['properties'] ={'timestamp':statuses[i]['created_at']}; 
        geomElement = {};
        geomElement['type']= "Point";
        a = coord.split(',');
        c2 = [float(a[1]), float(a[0])]
        geomElement['coordinates']=c2;
        element['geometry'] = geomElement;
        element['city'] = city;
        locationInfo.append(element);
         
    #     # Create an array of ones for every tweet
    #     ones_array = np.ones(len(time_stamps))
    #     # Create a Pandas DatetimeIndex of the time stamps
    #     time_index = pd.DatetimeIndex(time_stamps)
    #     # Create a Pandas DataFrame showing one tweet 
    #     # for each time stamp
    #     time_df = pd.DataFrame(ones_array, index=time_index, columns=['Tweets'])
    #     # Create a DataFrame associating a GPS coordinate and user location  
    #     # with each time stamp, if available
    #     coord_loc_data = {'Coordinates': tweet_coords, 'User Location': user_locations}
    #     coord_loc_df = pd.DataFrame(coord_loc_data, index=time_index)
    #     # Filter out entries without coordinates
    #     coord_loc_df = coord_loc_df[(coord_loc_df['Coordinates'] != 'None')]
    
    return locationInfo



In [45]:

    
# Create giant libraries of tweet information 
# containing time stamp, geocode, and user location
# Loop over hashtag, US metro area, and time, 
# as far back as the API will go

#hashtags = ['#Hillary2016', '#Sanders2016', 'Trump2016']
hashtags = ['HillaryClinton'];#, '#Sanders2016', 'Trump2016']
cities = ['New York', 'Los Angeles', 'Chicago', 
          'Houston', 'Philadelphia', 'Phoenix', 
          'San Antonio', 'San Diego', 'Dallas', 
          'San Jose', 'Austin', 'Jacksonville', 
          'San Francisco', 'Indianapolis', 'Columbus', 
          'Fort Worth', 'Charlotte', 'Detroit', 
          'El Paso', 'Seattle', 'Denver', 
          'Washington DC']
# Formatting each coordinate string as 'latitude, longitude' rather than
# 'latitude,longitude' apparently plays havoc with the search.tweets() 
# function
# Likewise, we apparently need geocode='latitude,longitude,search_radius' 
# in the argument list of search.tweets(), i.e. there can be no spaces 
# after the commas
coordinates = ['40.6643,-73.9385', '34.046281, -118.231898',#replacing LA'34.0194,-118.4108', 
               '41.8376,-87.6818', 
               '29.7805,-95.3863', '40.0094,-75.1333', '33.5722,-112.088', 
               '29.4724,-98.5251', '32.8153,-117.135', '32.7757,-96.7967', 
               '37.2969,-121.8193', '30.3072,-97.756', '30.337,-81.6613', 
               '37.7751,-122.4193', '39.7767,-86.1459', '39.9848,-82.985', 
               '37.7821120598956,-122.400612831116', '35.2087,-80.8307', '42.383,-83.1022', 
               '31.8484,-106.427', '47.6205,-122.3509', '39.7618,-104.8806', 
               '38.9041,-77.0171']
search_radius = '30km'

# Initialize the API call counter
api_calls = 0

# Instantiate the Twitter API object
twitter_api = oauth_login()
for hashtag in hashtags:
    locationInfo = [];
    for city, coords in zip(cities, coordinates):
        # Find as many tweets as possible containing hashtag 
        # originating within search_radius of coords
        statuses, api_calls = find_all_tweets(hashtag, coords, search_radius, api_calls)
        print len(statuses)
        
        # Extract the time stamps, coordinates, and user locations 
        # from the tweets found above
        locationInfo += parse_tweets_for_coordinate(statuses,coords, city)
    #GeoJSON Formatting of the features 
    geoj={};
    geoj['type']="FeatureCollection";
    geoj['features']=locationInfo;
    outputFile = 'friday_'+hashtag+'.geojson';
    with open(outputFile, 'w') as outfile:
        json.dump(geoj, outfile)
            
        print json.dumps(geoj, indent=1);        
        
        #         # Resample the DataFrame time_df
        #         time_df, bin_width = resample_time_stamps(time_df)

        #         # Create the plot
        #         time_plot = time_df.plot(legend=False)
        #         time_plot.set_xlabel('Date/Time')
        #         time_plot.set_ylabel('Tweets per '+bin_width)
        #         time_plot.set_title('Frequency of tweets containing hashtag '+hashtag
        #                             +'\n originating within '+search_radius
        #                             +' of '+city)
        
        # Save the plot
        #time_fig = time_plot.get_figure()
        #fig_name = hashtag + '_' + city + '_' + search_radius + '_time_plot.png'
        #time_fig.save(fig_name)









    



---------------------------------------------------------------------------
TwitterHTTPError                          Traceback (most recent call last)
<ipython-input-45-4fe1d95875e1> in <module>()
     41         # Find as many tweets as possible containing hashtag
     42         # originating within search_radius of coords
---> 43         statuses, api_calls = find_all_tweets(hashtag, coords, search_radius, api_calls)
     44         print len(statuses)
     45 

<ipython-input-33-8dcf9bf3b1e8> in find_all_tweets(hashtag, coords, search_radius, api_calls)
     36     geocode_str = coords + ',' + search_radius
     37     search_results =       twitter_api.search.tweets(q=hashtag,count=100,result_type='recent',
---> 38                                 geocode=geocode_str)
     39     statuses = search_results['statuses']
     40     # Update the counter

//anaconda/lib/python2.7/site-packages/twitter/api.pyc in __call__(self, **kwargs)
    306             return self._handle_response_with_retry(req, uri, arg_data, _timeout)
    307         else:
--> 308             return self._handle_response(req, uri, arg_data, _timeout)
    309 
    310     def _handle_response(self, req, uri, arg_data, _timeout=None):

//anaconda/lib/python2.7/site-packages/twitter/api.pyc in _handle_response(self, req, uri, arg_data, _timeout)
    339                 return []
    340             else:
--> 341                 raise TwitterHTTPError(e, uri, self.format, arg_data)
    342 
    343     def _handle_response_with_retry(self, req, uri, arg_data, _timeout=None):

TwitterHTTPError: Twitter sent status 429 for URL: 1.1/search/tweets.json using parameters: (count=100&geocode=40.6643%2C-73.9385%2C30km&oauth_consumer_key=JrPeKVNTJI8aK4dPQ4cI1qgJb&oauth_nonce=16327731599181864152&oauth_signature_method=HMAC-SHA1&oauth_timestamp=1438381894&oauth_token=250500792-NtyWwlGsZq54lMqZpiSBG70tUAOr1fKdbJdz4y5L&oauth_version=1.0&q=HillaryClinton&result_type=recent&oauth_signature=Xa9Xjqw6itqbE%2F4ZMxywtm5hxE4%3D)
details: {u'errors': [{u'message': u'Rate limit exceeded', u'code': 88}]}



In [39]:

    
c ='40.6643,-73.9385';

a = c.split(',');
print a
c2 = [float(a[0]), float(a[1])]
print c2









    



['40.6643', '-73.9385']
[40.6643, -73.9385]



In [ ]: