In [ ]:
import twitter
In [ ]:
import json
# Load the Twitter API credentials (consumer key/secret, access token
# key/secret) from a private JSON file and build the API client.
# Fix: use a context manager so the handle is closed even if json.load raises.
with open('/home/anshuman/.twitter_api_keys.json', 'r') as fp:
    kwargs = json.load(fp)
# Keep a second reference: later cells read individual keys from twitter_keys.
twitter_keys = kwargs
api = twitter.Api(**kwargs)
In [ ]:
print api.VerifyCredentials()
In [ ]:
# %timeit statuses = api.GetUserTimeline(screen_name="BarackObama")
# print [s.text for s in statuses]
In [ ]:
from twitter import TwitterError
In [ ]:
# Search for recent tweets posted within a 50 km radius of Northampton.
north_lat = 52.240477000000000000
north_long = -0.902655999999979000
north_radius = 50
radius_units = 'km'
# GetSearch expects geocode as (lat, long, "radius+units"), e.g. "50km".
geocode_arg = (north_lat, north_long, str(north_radius) + radius_units)
results = api.GetSearch(geocode=geocode_arg, count=100)
In [ ]:
# Show each tweet's text, delimited by '----' markers.
statuses = [s.text for s in results]
for status in statuses:
    print('----')
    print(status)
    print('----')
In [ ]:
results
Twitter provides a streaming API for downloading timelines or search results. To use it, we make a persistent HTTP request, and Twitter returns results in real time, as an effectively infinite stream of data. References:
In [ ]:
import urllib
from hashlib import sha1
import hmac
import random
import time
import base64
import re, string
# Seed the PRNG used below for OAuth nonce generation.
random.seed()
# Matches runs of characters that are not letters/digits (underscore included).
re_nonword = re.compile(r'[\W_]+')
# Percent-escape codes for '(', ',' and ')' — doubled by sub_escchars below.
re_28 = re.compile(r'(\%28)')
re_2C = re.compile(r'(\%2C)')
re_29 = re.compile(r'(\%29)')
def get_nonword(orig):
    """Return *orig* with every character that is not a letter or digit removed."""
    # Same pattern as the module-level re_nonword: non-word chars plus '_'.
    return re.sub(r'[\W_]+', '', orig)
def get_nonce():
    """Generate a random alphanumeric OAuth nonce.

    Base64-encodes 32 random bits and strips all non-word characters,
    per Twitter's nonce recommendation.
    """
    # Fix: b64encode requires bytes on Python 3; encode the decimal string
    # first and decode the result back to text. (Works on Python 2 as well.)
    raw = str(random.getrandbits(32)).encode('ascii')
    encoded = base64.b64encode(raw).decode('ascii')
    # Strip non-word characters (same pattern as get_nonword/re_nonword).
    return re.sub(r'[\W_]+', '', encoded)
def get_timestamp():
    """Return the current Unix time, truncated to whole seconds."""
    now = time.time()
    return int(now)
def sub_escchars(orig):
    """Double-escape the percent codes for '(', ',' and ')'.

    No idea why, but the Twitter signature generator encodes brackets,
    comma etc. as %25XX instead of %XX, where XX is the normal escape
    code for that non-word character. This replicates that quirk:
    %28 -> %2528, %2C -> %252C, %29 -> %2529.
    """
    # Single left-to-right pass; equivalent to the chained per-code subs
    # because no replacement can create a new %28/%2C/%29 match.
    return re.sub(r'%(28|2C|29)', lambda m: '%25' + m.group(1), orig)
def sign_request(consumer_secret,
                 token_secret,
                 req_type,
                 url,
                 argmap):
    '''
    Generate a %-encoded OAuth 1.0a HMAC-SHA1 signature for a request.

    Steps for generating the signature:
    1. Sort the keys in argmap alphabetically and join the key/value pairs in the format:
       base_string <- key1=value1&key2=value2...&keyN=valueN
    2. This is the 'base string', %-encode this string.
       base_string <- %-encode(base_string)
    3. %-encode the URL, and prepend it to the above base string with the '&' separator as:
       base_string <- enc_url&base_string
    4. Prepend the Request Type ("GET", "POST") to the string above using '&' as separator
       base_string <- req_type&base_string
    5. Generate the signing key as:
       key <- enc_consumer_secret&enc_token_secret
       where, enc_consumer_secret and enc_token_secret are %-encoded OAuth consumer & token secrets respectively.
    6. Generate the SHA1 HMAC by signing the base_string with the key.
       signature <- %-encode(base64(hmac(base_string,key)))

    NOTE(review): Python-2 only — urllib.quote_plus, hmac.new on str, and
    str.encode("base64") all changed or moved in Python 3.
    '''
    # %-encode the secrets for the signing key, as per specification
    enc_consumer_secret = urllib.quote_plus(consumer_secret)
    enc_token_secret = urllib.quote_plus(token_secret)
    print(argmap)  # debug: show the parameters being signed
    # Step 1: alphabetically sorted key=value pairs.
    args = sorted([ key + '=' + argmap[key] for key in argmap ])
    # Steps 2-4: build the base string REQ_TYPE&ENC_URL&ENC_PARAMS.
    args_str = '&'.join(args) # as specified by oauth
    args_str = urllib.quote_plus(args_str)
    enc_url = urllib.quote_plus(url)
    args_str = req_type + "&" + enc_url + "&" + args_str
    # Replicate Twitter's double-escaping of '(', ',' and ')'.
    args_str = sub_escchars(args_str)
    # key = CONSUMER_SECRET& #If you dont have a token yet
    key = ""
    # NOTE(review): enc_token_secret can never be None here — if token_secret
    # were None, urllib.quote_plus above would already have raised. The check
    # presumably belongs on token_secret itself, before encoding; confirm.
    if enc_token_secret is None:
        key = enc_consumer_secret + '&'
    else:
        key = enc_consumer_secret + '&' + enc_token_secret
    #key = &TOKEN_SECRET"
    print("Base string : %s" % args_str)
    print("sigining key : %s" % key)
    # Step 6: SHA1-HMAC, base64-encoded without the trailing newline.
    hashed = hmac.new(key, args_str, sha1)
    hashed_b64 = hashed.digest().encode("base64").rstrip('\n')
    print("Signature : %s" % hashed_b64)
    # The signature
    return urllib.quote_plus(hashed_b64)
# The URL to query
# The streaming-API endpoint to query (statuses/filter).
url = 'https://stream.twitter.com/1.1/statuses/filter.json'
#include_entities true
#geocode = '(52.240477000000000000,-0.902655999999979000,50km)'
# Bounding boxes as 'west_lon,south_lat,east_lon,north_lat' strings.
northants_box = '-1.386293,51.985165,-0.282167,52.650010'
london_box = '-0.567680,51.277729,0.289254,51.701847'
# Keyword filter — currently commented out below in favour of 'locations'.
track = 'india'
''' Consumer Key
'''
oauth_consumer_key = twitter_keys["consumer_key"]
'''Nonce
The oauth_nonce parameter is a unique token your application should generate for each unique request.
Twitter will use this value to determine whether a request has been submitted multiple times. The value
for this request was generated by base64 encoding 32 bytes of random data, and stripping out all non-word
characters, but any approach which produces a relatively random alphanumeric string should be OK here.
'''
oauth_nonce = get_nonce()
oauth_signature_method = "HMAC-SHA1"
'''Timestamp
The oauth_timestamp parameter indicates when the request was created. This value should be the number of seconds
since the Unix epoch at the point the request is generated, and should be easily generated in most programming
languages. Twitter will reject requests which were created too far in the past, so it is important to keep the
clock of the computer generating requests in sync with NTP.
'''
oauth_timestamp = get_timestamp()
oauth_token = twitter_keys["access_token_key"]
oauth_version = "1.0"
# The Parameters used in the base string for signing (request args + all
# oauth_* parameters except the signature itself).
argmap = {
    'locations' : northants_box,
    #'track' : track,
    'oauth_consumer_key' : oauth_consumer_key,
    'oauth_nonce' : oauth_nonce,
    'oauth_signature_method' : oauth_signature_method,
    'oauth_timestamp' : str(oauth_timestamp),
    'oauth_token' : oauth_token,
    'oauth_version' : oauth_version,
}
# Parameters that constitute the signing key
oauth_consumer_secret = twitter_keys["consumer_secret"]
oauth_token_secret = twitter_keys["access_token_secret"]
# Generate the oauth request signature
oauth_signature = sign_request(oauth_consumer_secret, oauth_token_secret, "GET", url, argmap)
# The signing key is simply the percent encoded consumer secret, followed by
# an ampersand character '&', followed by the percent encoded token secret.
req_args = {
    'locations' : northants_box,
    #'track' : track,
}
twurl = url
# Generate the %-encoded request args
twdata = urllib.urlencode(req_args)
# Set up the request header. Every oauth_* value here must match what was
# signed above, or Twitter rejects the request.
twheader = 'Authorization: OAuth oauth_consumer_key="%s", oauth_nonce="%s", oauth_signature="%s", oauth_signature_method="HMAC-SHA1", oauth_timestamp="%d", oauth_token="%s", oauth_version="1.0"' % (oauth_consumer_key, oauth_nonce, oauth_signature, oauth_timestamp, oauth_token)
# Generate the cURL command line
twcurl_cmd = "curl --get " + twurl + " --data '" + twdata + "' --header '" + twheader + "' --verbose"
# Run this command to stream the search results.
print("cURL command line:")
print(twcurl_cmd)
The following generated command can be used to download filtered Twitter statuses using the streaming API:
$ curl --get https://stream.twitter.com/1.1/statuses/filter.json --data 'locations=-1.386293%2C51.985165%2C-0.282167%2C52.650010' --header 'Authorization: OAuth oauth_consumer_key="XXXXXXXX", oauth_nonce="MzA1OTUzNjg3MQ", oauth_signature="PooL0huM2TirKvTZCroKjeRtI5Q%3D", oauth_signature_method="HMAC-SHA1", oauth_timestamp="1423089806", oauth_token="YYYYYYYY", oauth_version="1.0"' --verbose
In [1]:
import json
import pandas as pd
import numpy as np
def process_results(fp, max=1000):
    """Yield decoded JSON objects from a file of line-delimited JSON.

    Reads at most *max* lines from *fp*, parses each as JSON and yields
    the resulting object.  Malformed lines are counted and skipped; the
    error count is printed when iteration finishes.

    Fixes over the original: stop at end-of-file instead of counting
    every post-EOF read as an error, and catch only JSON decode errors
    rather than using bare ``except:`` clauses.
    """
    # NOTE: 'max' shadows the builtin, but the name is kept so existing
    # keyword callers are unaffected.
    lines = 0
    errors = 0
    while lines < max:
        next_line = fp.readline()
        if not next_line:
            break  # readline() returns '' only at EOF
        lines += 1
        try:
            yield json.loads(next_line)
        except ValueError:  # json.JSONDecodeError subclasses ValueError
            errors += 1
    print(errors)
def make_map(fp):
    """Yield one flat row dict per tweet read from *fp*.

    Each row carries the tweet id, text, author handle, creation time
    and the geo coordinates; tweets without geo data get NaN for
    latitude/longitude.
    """
    for r in process_results(fp):
        text = r["text"]
        # Fix: treat a missing "geo" key the same as geo == None.
        # (The original fell into the else branch and raised KeyError
        # when "geo" was absent.)
        geo = r.get("geo")
        if geo is None:
            lat, lon = np.nan, np.nan
        else:
            # Same element order as the original (coordinates[0] -> latitude);
            # Twitter's "geo" field lists latitude first.
            lat = geo["coordinates"][0]
            lon = geo["coordinates"][1]
        yield {
            "id": r["id"],
            "status": text,
            "user": "@" + r["user"]["screen_name"],
            "created_at": r["created_at"],
            "latitude": lat,
            "longitude": lon,
        }
# Build a DataFrame of tweet rows from the captured streaming results;
# the context manager closes the file even if parsing fails.
with open("/home/anshuman/northants/twitter_search_results.json", "r") as fp:
    df = pd.DataFrame(make_map(fp))
df
Out[1]:
In [ ]: