In [1]:
import tweepy
import re
import json
import sqlite3 as lite
import pandas as pd
import datetime, time, os, sys
import argparse, configparser
Config = configparser.ConfigParser()
Config.read('config.cnf')
consumer_key = Config.get('twittersfupubresearch', 'consumer_key')
consumer_secret = Config.get('twittersfupubresearch', 'consumer_secret')
access_token = Config.get('twittersfupubresearch', 'access_token')
access_token_secret = Config.get('twittersfupubresearch', 'access_token_secret')
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# set up access to the Twitter API
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
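Everything below assumes these credentials are valid; a quick sanity check, sketched here with tweepy 3.x's api.verify_credentials(), fails immediately if config.cnf is misconfigured.
In [ ]:
# Sketch: confirm the OAuth credentials before making real calls.
# Any failure here points at config.cnf rather than the code below.
try:
    me = api.verify_credentials()
    print("Authenticated as", me.screen_name)
except tweepy.TweepError as error:
    print("Authentication failed:", error)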
In [8]:
df = pd.read_excel('data/Pundits_Altmetric_Coded_Recoded_Current.xlsx')
screen_names = list(df.Author_ID_On_Source.dropna().unique())
In [21]:
pundits = {}
timelines = {}
for pundit in screen_names:
    pundits[pundit] = api.get_user(screen_name=pundit)
    timelines[pundit] = api.user_timeline(screen_name=pundit, count=100)
In [59]:
for pundit in screen_names:
    try:
        status = timelines[pundit][-1]
        l = len(timelines[pundit])
        # print(pundit, "%.1f" % float(l/(datetime.datetime.today() - status.created_at).days))
        diff = datetime.datetime.now() - timelines[pundit][-1].created_at
        print("%s\t%.1f" % (pundit, float(l/(diff.days + diff.seconds/60/60/24))))
    except ZeroDivisionError:
        print(pundit, "100+")
In [10]:
try:
    tweet = api.get_status(39383838383838383838383)
except tweepy.TweepError as error:
    err = error
    user = api.get_user('juancommander')
    print(error.reason)
In [21]:
err.args[0][0]['message'] += '; juan'
In [15]:
jsonr
Out[15]:
In [3]:
evolBioCon = lite.connect("data/BMCevolBioSample.db")
bioCon = lite.connect("data/BMCbioSample.db")
In [6]:
def load_user_errors(con):
    return pd.read_sql("SELECT DISTINCT old_screen_name, error FROM sample WHERE error IS NOT NULL", con, index_col='old_screen_name')
df = load_user_errors(evolBioCon).append(load_user_errors(bioCon))
In [8]:
user = api.get_user('juancommander')
In [36]:
df = pd.read_sql("SELECT doi, tweet_id, old_screen_name, tweet FROM sample WHERE tweet IS NOT NULL ", litecon, index_col='tweet_id')
df = df[~df.tweet.isnull()]
df['tweet'] = df.tweet.apply(lambda x: json.loads(x) if x is not None else None)
df['created_at'] = df.tweet.apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(x['created_at'],'%a %b %d %H:%M:%S +0000 %Y')))
df['created_at'] = pd.to_datetime(df.created_at)
df['created_at_dayofweek'] = df.tweet.apply(lambda x: x['created_at'][0:3])
df['user'] = df.tweet.apply(lambda x: x['user'])
df['screen_name'] = df.tweet.apply(lambda x: x['user']['screen_name'])
# df['user_id'] = df.tweet.apply(lambda x: int(x['user']['id_str']))
# df['user_utc_offset'] = df.tweet.apply(lambda x: x['user']['utc_offset'])
# df['user_name'] = df.tweet.apply(lambda x: x['user']['name'])
# df['user_followers_count'] = df.tweet.apply(lambda x: x['user']['followers_count'])
# df['user_friends_count'] = df.tweet.apply(lambda x: x['user']['friends_count'])
# df['user_description'] = df.tweet.apply(lambda x: re.sub( '\s+', ' ', x['user']['description']).strip())
# df['user_statuses_count'] = df.tweet.apply(lambda x: x['user']['statuses_count'])
df['is_retweet'] = df.tweet.apply(lambda x: 'retweeted_status' in x)
df['is_retweet'] = df['is_retweet'].fillna(False)
df['retweet_of_status_id_str'] = df.tweet.apply(lambda x: x['retweeted_status']['id_str'] if 'retweeted_status' in x else None)
df['retweet_of_screen_name'] = df.tweet.apply(lambda x: x['retweeted_status']['user']['screen_name'] if 'retweeted_status' in x else None)
df['is_reply'] = df.tweet.apply(lambda x: x['in_reply_to_status_id'] is not None)
df['in_reply_to_status_id_str'] = df.tweet.apply(lambda x: x['in_reply_to_status_id_str'])
df['in_reply_to_screen_name'] = df.tweet.apply(lambda x: x['in_reply_to_screen_name'])
df['text'] = df.tweet.apply(lambda x: re.sub(r'\s+', ' ', x['text']).strip()) # collapse whitespace/newlines so each tweet stays on one line
del df['tweet']
tweetdetails = df.sort_index()
del df
df = pd.read_sql("SELECT doi, tweet_id, old_screen_name FROM sample WHERE error LIKE '%screen_name%'", litecon, index_col='old_screen_name')
users_df = pd.read_sql("SELECT screen_name, user_object FROM users", litecon, index_col='screen_name')
users_df['user'] = users_df.user_object.map(json.loads)
del users_df['user_object']
df = df.join(users_df, how="inner")
df.index.name = 'screen_name'
df = df.reset_index().set_index('tweet_id')
tweetdetails = tweetdetails.append(df).sort_index()
del df
for field in ['id', 'name', 'followers_count', 'friends_count','statuses_count', 'description']:
    tweetdetails['user_%s' % field] = tweetdetails.user.map(lambda x: x[field])
del tweetdetails['user']
In [40]:
tweetdetails.to_excel('data/lis_tweetdetails.xlsx')
In [13]:
def get_user(screen_name):
    try:
        user = api.get_user(screen_name=screen_name)
        return None
    except tweepy.TweepError as error:
        return error.args[0][0]['message']
df['updated_error'] = df.index.map(get_user)
In [16]:
print(len(df))
print(len(df[df.updated_error.isnull()]))
In [41]:
litecon = lite.connect('new_yorker_2.0.db')
In [29]:
import pandas as pd
user_df = pd.read_sql("SELECT user_id, screen_name, user_object, timeline, timeline_error, timeline_modified, user_modified FROM users", litecon, index_col='user_id')
user_df.index = user_df.index.astype(int)
In [4]:
s = user_df.sample(1)
t = json.loads(s.iloc[0]['timeline'])
In [19]:
with litecon:
    litecur = litecon.cursor()
    litecur.execute('SELECT tweet_id, tweet FROM sample WHERE user_id IS NULL AND tweet IS NOT NULL')
    sampled = litecur.fetchall()
    for s in sampled:
        tweet_id = s[0]
        tweet = json.loads(s[1])
        litecur.execute('UPDATE sample SET user_id = ? WHERE tweet_id = ?', (tweet['user']['id_str'], tweet_id))
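A quick follow-up check (a sketch against the same sample table) counts how many hydrated tweets are still missing a user_id after the backfill above.
In [ ]:
# Sketch: count sampled tweets that still lack a user_id after the backfill.
with litecon:
    litecur = litecon.cursor()
    litecur.execute('SELECT COUNT(*) FROM sample WHERE user_id IS NULL AND tweet IS NOT NULL')
    print(litecur.fetchone()[0])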
In [39]:
results = api.search('washingtonpost.com AND science filter:links', rpp=5) #
results[0]._json['entities']['urls']
Out[39]:
In [40]:
for r in results:
    for u in r._json['entities']['urls']:
        print(u['expanded_url'])
    print()
In [36]:
import unshortenit  # assumed: the module providing the unshorten_only() call below
for r in results:
    for u in r._json['entities']['urls']:
        unshortened_uri, status = unshortenit.unshorten_only(u['expanded_url'])
        print(unshortened_uri, status)
    print()
In [42]:
query = tweepy.Cursor(api.search, q='washingtonpost.com AND science filter:links').items(10)
tweets = [status._json for status in query]
tweets = pd.DataFrame(tweets)
users = pd.DataFrame(tweets["user"].to_dict()).T
users = users.rename(columns=lambda x: 'user:' + x)
tweets = pd.concat([tweets, users], axis=1)
In [43]:
pd.DataFrame(tweets)
Out[43]:
In [42]:
since_id = '652417479537479680'
mentions = api.mentions_timeline(count=200, since_id=since_id, include_rts=0)
for i, m in enumerate(mentions):
    tweet = api.get_status(id=m.id_str)
    with litecon:
        litecur = litecon.cursor()
        litecur.execute('INSERT INTO response_data (user_id_str, tweet_id, time_received, tweet_text, tweet) VALUES (?,?,?,?,?)', (tweet.user.id_str, tweet.id_str, tweet.created_at, tweet.text, json.dumps(tweet._json)))
        litecon.commit()
    if i == 0:
        since_id = m.id_str
In [47]:
status.user.screen_name
Out[47]:
In [2]:
api.followers_ids(id=2270698615)
Out[2]:
In [4]:
u = api.get_user(2270698615)
u.screen_name
Out[4]:
In [24]:
import botornot
twitter_app_auth = {
'consumer_key': consumer_key,
'consumer_secret': consumer_secret,
'access_token': access_token,
'access_token_secret': access_token_secret,
'wait_on_rate_limit': True,
'wait_on_rate_limit_notify': True
}
bon = botornot.BotOrNot(**twitter_app_auth)
bon.twitter_api.wait_on_rate_limit_notify = True
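With the classifier configured, a single account can be scored; this is a sketch assuming the botornot package's check_account() method, reusing a screen name queried elsewhere in this notebook.
In [ ]:
# Sketch: score one account with BotOrNot; check_account() is assumed from the
# botornot package and returns a dict of classifier scores.
result = bon.check_account('@juancommander')
result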
In [37]:
import tweepy
import re
import json
import sqlite3 as lite
import datetime, time, os, sys
import argparse, configparser
Config = configparser.ConfigParser()
Config.read('config.cnf')
litecon = lite.connect('data/twitter.db')
In [61]:
user = api.get_user('juancommander')
user_id = user.id
ids = []
for page in tweepy.Cursor(api.followers_ids, id=user_id).pages():
    ids.extend(page)
len(ids)
Out[61]:
In [17]:
Config = configparser.ConfigParser()
Config.read('config.cnf')
litecon = lite.connect('data/lis.db')
with litecon:
    # set up SQL tables
    litecur = litecon.cursor()
    # the sample, with two columns for either the tweet itself or the error hit while retrieving it
    litecur.execute("CREATE TABLE IF NOT EXISTS sample (doi TEXT, old_screen_name TEXT, tweet_id TEXT, tweet TEXT, error TEXT, modified TEXT)")
    litecur.execute("CREATE INDEX IF NOT EXISTS sample_old_screen_name ON sample (old_screen_name)")
    litecur.execute("CREATE INDEX IF NOT EXISTS sample_tweet_id ON sample (tweet_id)")
    litecur.execute("CREATE INDEX IF NOT EXISTS sample_modified ON sample (modified)")
    # the users that were found
    litecur.execute("CREATE TABLE IF NOT EXISTS users (user_id TEXT, screen_name TEXT, user_object TEXT, timeline TEXT, timeline_error TEXT, timeline_modified TEXT, user_modified TEXT)")
    litecur.execute("CREATE UNIQUE INDEX IF NOT EXISTS users_user_id ON users (user_id)")
    litecur.execute("CREATE INDEX IF NOT EXISTS users_screen_name ON users (screen_name)")
    litecur.execute("CREATE TABLE IF NOT EXISTS friends (user_id TEXT, friend_id TEXT, modified TEXT)")
    litecur.execute("CREATE INDEX IF NOT EXISTS friends_user_id ON friends (user_id)")
    litecur.execute("CREATE UNIQUE INDEX IF NOT EXISTS friends_user_friend_id ON friends (user_id, friend_id)")
    litecur.execute("CREATE TABLE IF NOT EXISTS followers (user_id TEXT, follower_id TEXT, modified TEXT)")
    litecur.execute("CREATE INDEX IF NOT EXISTS followers_user_id ON followers (user_id)")
    litecur.execute("CREATE UNIQUE INDEX IF NOT EXISTS followers_user_follower_id ON followers (user_id, follower_id)")
consumer_key = Config.get('twittersfupubresearch', 'consumer_key')
consumer_secret = Config.get('twittersfupubresearch', 'consumer_secret')
access_token = Config.get('twittersfupubresearch', 'access_token')
access_token_secret = Config.get('twittersfupubresearch', 'access_token_secret')
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# set up access to the Twitter API
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
def load_file(filename):
    with open(filename, 'r') as f:
        l = f.readline()
        with litecon:
            litecur = litecon.cursor()
            for l in f:
                l = [x.strip('"') for x in l.strip().split('\t')]
                doi = l[0]
                screenname = l[1]
                tweet_id = l[2]
                try:
                    litecur.execute('INSERT INTO sample (doi, old_screen_name, tweet_id) VALUES (?, ?, ?)', (doi, screenname, tweet_id))
                except lite.IntegrityError:
                    # don't worry about duplicates
                    pass
def __save_tweet(tweet_id, tweet, error=None):
    '''
    Do the actual SQLite update with the info collected
    '''
    now = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
    with litecon:
        litecur = litecon.cursor()
        if error:
            try:
                m = error.args[0][0]['message']
            except Exception:
                m = str(error)
            litecur.execute('UPDATE sample SET error = ?, modified = ? WHERE tweet_id = ?', (m, now, tweet_id))
        else:
            litecur.execute('UPDATE sample SET tweet = ?, modified = ? WHERE tweet_id = ?', (json.dumps(tweet._json), now, tweet_id))
            try:
                litecur.execute('INSERT INTO users (user_id, screen_name, user_object, user_modified) VALUES (?, ?, ?, ?)', (tweet.user.id, tweet.user.screen_name, json.dumps(tweet.user._json), now))
            except lite.IntegrityError:
                # don't worry about duplicates
                pass
def __save_timeline(user_id, timeline, error=None):
    '''
    Do the actual SQLite update with the info collected
    '''
    now = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
    with litecon:
        litecur = litecon.cursor()
        if error:
            try:
                m = error.args[0][0]['message']
            except Exception:
                m = str(error)
            litecur.execute('UPDATE users SET timeline_error = ?, timeline_modified = ? WHERE user_id = ?', (m, now, user_id))
        else:
            litecur.execute('UPDATE users SET timeline = ?, timeline_modified = ? WHERE user_id = ?', (json.dumps([s._json for s in timeline]), now, user_id))
def get_tweets_in_sample():
    '''
    Find all the tweets in the sample that have not been fetched yet
    and make individual calls to find the users associated with them
    '''
    with litecon:
        litecur = litecon.cursor()
        litecur.execute("SELECT tweet_id, old_screen_name FROM sample WHERE tweet IS NULL")
        sampled = litecur.fetchall()
        for s in sampled:
            tweet_id = s[0]
            try:
                tweet = api.get_status(tweet_id)
                __save_tweet(tweet_id, tweet)
            except tweepy.TweepError as error:
                # bit hacky, we're passing a tweet_id instead of a tweet here
                try:
                    user = api.get_user(screen_name=s[1])
                    try:
                        error.args[0][0]['message'] += '; Found by screen_name'
                        __save_tweet(tweet_id, None, error)
                        now = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
                        litecur.execute('INSERT INTO users (user_id, screen_name, user_object, user_modified) VALUES (?, ?, ?, ?)', (user.id, user.screen_name, json.dumps(user._json), now))
                    except lite.IntegrityError:
                        # don't worry about duplicates
                        pass
                except tweepy.TweepError as error2:
                    __save_tweet(tweet_id, None, error)  # leave the original error
        # now try all the errors one more time
        litecur.execute('SELECT tweet_id FROM sample WHERE tweet IS NULL AND error IS NOT NULL')
        sampled = litecur.fetchall()
def get_tweets_in_sample_batch():
    '''
    Find all the tweets in the sample that have not been fetched yet
    and make a batch call to find the users associated with them
    '''
    with litecon:
        litecur = litecon.cursor()
        litecur.execute('SELECT tweet_id FROM sample WHERE tweet IS NULL AND error IS NULL')
        i = 0
        while True:
            tweet_ids = [t[0] for t in litecur.fetchmany(100)]
            if not tweet_ids:
                break
            try:
                statuses = api.statuses_lookup(tweet_ids)
                for tweet in statuses:
                    __save_tweet(tweet.id, tweet)
                    i += 1
            except tweepy.TweepError as error:
                print(error)
                exit(1)
    print("Done %s" % i)
def get_timelines_batch():
    '''
    Get all the timeline JSON objects for users we know exist (we've saved)
    '''
    while True:
        with litecon:
            litecur = litecon.cursor()
            litecur.execute('SELECT u.user_id FROM users u WHERE timeline IS NULL AND timeline_error IS NULL')
            # go 100 at a time so we're not hitting the DB so much
            user_ids = [u[0] for u in litecur.fetchmany(100)]
        if not user_ids:
            break
        for user_id in user_ids:
            try:
                timeline = api.user_timeline(user_id)
                __save_timeline(user_id, timeline)
            except tweepy.TweepError as error:
                __save_timeline(user_id, None, error)
def __save_network(endpoint, user_id, ids, error=None):
    '''
    Do the actual SQLite update with the info collected
    '''
    now = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
    with litecon:
        litecur = litecon.cursor()
        if error:
            try:
                m = error.args[0][0]['message']
            except Exception:
                m = str(error)
            print("Error: ", user_id, m)
            litecur.execute('INSERT INTO %s (user_id, %s_id, modified) VALUES (?, ?, ?)' % (endpoint, endpoint[:-1]), (user_id, -1, now))
        else:
            print('saving', len(ids))
            for f in ids:
                try:
                    litecur.execute('INSERT INTO %s (user_id, %s_id, modified) VALUES (?, ?, ?)' % (endpoint, endpoint[:-1]), (user_id, f, now))
                except lite.IntegrityError:
                    pass  # ignore duplicates, they won't change the network
def get_friends(user_id=None):
    get_network('friends', user_id)
def get_followers(user_id=None):
    get_network('followers', user_id)
def get_network(endpoint, user_id=None):
    '''
    Get the friends/followers list for all users (or for a specific user)
    '''
    if user_id is None:
        while True:
            with litecon:
                litecur = litecon.cursor()
                litecur.execute('SELECT u.user_id FROM users u LEFT JOIN %s f ON (u.user_id = f.user_id) WHERE f.user_id IS NULL' % endpoint)
                # go 100 at a time so we're not hitting the DB so much
                users = [u[0] for u in litecur.fetchmany(100)]
            if not users:
                break
            for user_id in users:
                ids = get_network(endpoint, user_id)
    else:
        try:
            ids = []
            # a user_id was passed in, fetch it and return a friends list
            if endpoint == 'friends':
                for page in tweepy.Cursor(api.friends_ids, id=user_id).pages():
                    ids.extend(page)
            elif endpoint == 'followers':
                for page in tweepy.Cursor(api.followers_ids, id=user_id).pages():
                    ids.extend(page)
            # put in something so that we know we've gone after this user
            if len(ids) == 0:
                __save_network(endpoint, user_id, None, 'No %s found' % endpoint)
            __save_network(endpoint, user_id, ids)
        except tweepy.TweepError as error:
            __save_network(endpoint, user_id, None, error)
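With the tables and helpers above in place, one pass of the collection pipeline could be driven as sketched below; 'data/sample.tsv' is a placeholder path, not a file from the original run.
In [ ]:
# Sketch of one pipeline pass using the helpers defined above.
load_file('data/sample.tsv')      # placeholder TSV of doi / screen_name / tweet_id rows
get_tweets_in_sample_batch()      # hydrate tweets 100 at a time via statuses_lookup
get_tweets_in_sample()            # retry individually, falling back to the old screen name
get_timelines_batch()             # fetch recent timelines for every saved user
get_friends()                     # friends_ids for users without a friends record
get_followers()                   # followers_ids for users without a followers record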
In [22]:
err = None
tweet_id = '600296933484326913'
try:
    tweet = api.get_status(tweet_id)
except tweepy.TweepError as error:
    err = error
In [35]:
err.args[0][0]['message']
Out[35]:
In [116]:
screen_names = ['sage_time', 'RamonDavisMark', 'HarrySpoelstra', 'NatRevNeurol', 'mzkhalil', 'Bill_Bl4ck', 'ewydh', 'annabelgillfi', 'viogibsix', 'SaraDivinorum', 'Migraine_Wisdom', 'TheLancet']
print(len(screen_names))
In [119]:
user_ids = []
for screen_name in screen_names:
    try:
        user = api.get_user(screen_name)
        user_ids.append(user.id)
    except tweepy.TweepError as error:
        print(screen_name, error)
In [118]:
for user_id in user_ids:
    get_followers(user_id)
    get_friends(user_id)
In [120]:
0+ 458+ 927+ 122+ 400+ 223+ 179+ 1095+ 545+ 283
Out[120]:
In [14]:
with litecon:
    litecur = litecon.cursor()
    litecur.execute('SELECT user_id, screen_name FROM users WHERE botornot IS NULL LIMIT 2')
    fetchedmany = litecur.fetchmany(100)
    # go 100 at a time so we're not hitting the DB so much
    # users = [(u[0], u[1]) for u in ]
In [48]:
import pandas as pd
def tweetHarvester(thisString, hits):
    query = tweepy.Cursor(api.search, q=thisString + ' filter:links').items(hits)
    tweets = [status._json for status in query]
    tweets = pd.DataFrame(tweets)
    entities = pd.DataFrame(tweets["entities"].to_dict()).T
    entities = entities.rename(columns=lambda x: 'entities:' + x)
    tweets = pd.concat([tweets, entities], axis=1)
    metadata = pd.DataFrame(tweets["metadata"].to_dict()).T
    metadata = metadata.rename(columns=lambda x: 'metadata:' + x)
    tweets = pd.concat([tweets, metadata], axis=1)
    users = pd.DataFrame(tweets["user"].to_dict()).T
    users = users.rename(columns=lambda x: 'user:' + x)
    tweets = pd.concat([tweets, users], axis=1)
    tweets = tweets.drop(['entities', 'metadata', 'user'], axis=1)
    return tweets
In [49]:
df = tweetHarvester('newyorker.com', 2)
In [52]:
for c in df.columns:
    print(c)
In [16]:
try:
    api.update_status('dup')
except tweepy.TweepError as error:
    err = error
In [2]:
screen_names = ['AnandWilson91', 'Firefly_fan', 'OnCritical', 'P1NDSTER', 'Paul_in_Aber', 'Whole9SoPacific', 'albertinquiet', 'johnpane', 'markushinka', 'nacho_zizou', 'satbhambra', 'seitics', 'bemyprimate', 'ACountyGurl', 'All4thelight', 'peter_makin', 'wt3']
In [15]:
responses = {}
for screen_name in screen_names:
    try:
        responses[screen_name] = api.get_user(screen_name=screen_name)
    except tweepy.TweepError as error:
        print(screen_name, error.args[0][0]['message'])
In [14]:
responses
Out[14]: