In [4]:
# inspired by https://gist.github.com/yanofsky/5436496


'''
@author Manuel Kohler
'''

%matplotlib inline 

import csv
import datetime
import json
import os
from collections import Counter
from datetime import timedelta

import tweepy
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener

import numpy as np
import pandas as pd
from pytz import timezone

import matplotlib
matplotlib.style.use('ggplot')

screen_name = 'realDonaldTrump'
#screen_name = 'rogerfederer'

_count = 200

base_dir = '/Users/kohleman/PycharmProjects/twitter/twitter_api/twitter_api/data/'
raw_data_file = os.path.join(base_dir, 'raw_tweets_{0}.json'.format(screen_name))

# Login to API
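# The four OAuth credentials are not defined anywhere in this notebook;
# as a sketch, assume they are supplied via environment variables
# (hypothetical variable names):
ckey = os.environ['TWITTER_CONSUMER_KEY']
csecret = os.environ['TWITTER_CONSUMER_SECRET']
atoken = os.environ['TWITTER_ACCESS_TOKEN']
asecret = os.environ['TWITTER_ACCESS_SECRET']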


auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
api = tweepy.API(auth)


class Listener(StreamListener):
    """Minimal stream listener that prints the raw tweet data."""

    def on_data(self, raw_data):
        print(raw_data)
        return True

    def on_error(self, status_code):
        print(status_code)
        # returning False disconnects the stream, e.g. on a 420 rate limit
        return False

        
def extract_json(raw_tweets):
    # wrap each tweet's JSON payload in its own single-element list
    return [[tweet._json] for tweet in raw_tweets]

In [5]:
# Access current stream
twitter_stream = Stream(auth, Listener())
# twitter_stream.filter(track=["#trump"])
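
The commented-out filter() call above blocks until interrupted. A hedged sketch of a listener that disconnects itself after a fixed number of tweets (CountingListener and stop_after are illustrative names, not part of the original notebook):

In [ ]:
class CountingListener(StreamListener):
    """Sketch: a listener that disconnects after stop_after tweets."""

    def __init__(self, stop_after=10):
        super().__init__()
        self.stop_after = stop_after
        self.seen = 0

    def on_data(self, raw_data):
        print(raw_data)
        self.seen += 1
        # returning False tells tweepy to disconnect the stream
        return self.seen < self.stop_after

    def on_error(self, status_code):
        print(status_code)
        return False

# counting_stream = Stream(auth, CountingListener(stop_after=5))
# counting_stream.filter(track=["#trump"])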

In [6]:
def get_all_tweets(screen_name):
    # adapted from https://github.com/adilmoujahid/Twitter_Analytics/blob/master/analyze_tweets.py
    # Twitter only allows access to a user's most recent ~3200 tweets with this method

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=_count)

    # save most recent tweets
    alltweets.extend(new_tweets)

    # save the id of the oldest tweet, less one
    oldest = alltweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before {0}".format(oldest))

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=_count, max_id=oldest)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # update the id of the oldest tweet, less one
        oldest = alltweets[-1].id - 1

        print("...{0} tweets downloaded so far".format(len(alltweets)))

    # transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8")] for tweet in alltweets]

    # write the csv
    with open(os.path.join(base_dir, '{0}_all_tweets.csv'.format(screen_name)), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "text"])
        writer.writerows(outtweets)

    return alltweets
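
tweepy also ships a Cursor helper that does the same max_id bookkeeping internally; a minimal sketch of the equivalent download (still subject to the ~3200-tweet timeline limit):

In [ ]:
# Sketch: equivalent pagination via tweepy.Cursor, which manages max_id
# internally; the ~3200-tweet user_timeline limit still applies.
cursor_tweets = list(tweepy.Cursor(api.user_timeline,
                                   screen_name=screen_name,
                                   count=_count).items())
print("Cursor fetched {0} tweets".format(len(cursor_tweets)))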

In [8]:
alltweets = get_all_tweets(screen_name)


getting tweets before 821719763214880768
...400 tweets downloaded so far
getting tweets before 808642018612310015
...600 tweets downloaded so far
getting tweets before 795057936565313535
...800 tweets downloaded so far
getting tweets before 789224624320028671
...1000 tweets downloaded so far
getting tweets before 785913754194104319
...1200 tweets downloaded so far
getting tweets before 781785509639118847
...1400 tweets downloaded so far
getting tweets before 774484342030602239
...1600 tweets downloaded so far
getting tweets before 766627569110249471
...1800 tweets downloaded so far
getting tweets before 759191265988653055
...2000 tweets downloaded so far
getting tweets before 754291925616852991
...2200 tweets downloaded so far
getting tweets before 746272130992644095
...2400 tweets downloaded so far
getting tweets before 738598954468659199
...2600 tweets downloaded so far
getting tweets before 732726105837277183
...2800 tweets downloaded so far
getting tweets before 725722027173249023
...3000 tweets downloaded so far
getting tweets before 718409541273194496
...3200 tweets downloaded so far
getting tweets before 711209246419845119
...3243 tweets downloaded so far
getting tweets before 710453513155960833
...3243 tweets downloaded so far

In [9]:
# Historic tweets
#http://trumptwitterarchive.com/data/realdonaldtrump/2009.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2010.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2011.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2012.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2013.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2014.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2015.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2016.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2017.json
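
A sketch for fetching and merging these yearly archive files (assumes the requests package is installed and that each URL returns a JSON array of tweet objects):

In [ ]:
# Sketch: download and merge the yearly archive dumps listed above.
# Assumes `requests` is available and each file is a JSON array.
import requests

archive_url = 'http://trumptwitterarchive.com/data/realdonaldtrump/{0}.json'
historic_tweets = []
for year in range(2009, 2018):
    response = requests.get(archive_url.format(year))
    response.raise_for_status()
    historic_tweets.extend(response.json())
print("Loaded {0} historic tweets".format(len(historic_tweets)))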

In [11]:
# Accessing https://dev.twitter.com/rest/reference/get/statuses/user_timeline


# include_rts = include retweets
#new_tweets = api.user_timeline(screen_name = screen_name, count=count, include_rts=True)
# max_id_tweets = api.user_timeline(screen_name = screen_name,count=count, include_rts=True, max_id=818643528905621504)


outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8"), tweet._json]
             for tweet in alltweets]

json_tweets = extract_json(alltweets)
# older_tweets = extract_json(max_id_tweets)

with open(raw_data_file, 'w') as jsonfile:
    json.dump(json_tweets, jsonfile, ensure_ascii=False, indent=4, separators=(',', ': '))
#     json.dump(older_tweets, jsonfile, ensure_ascii=False, indent=4, separators=(',', ': '))
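
Reading the dump back is symmetric; note that each entry is a single-element list because extract_json wraps every tweet's JSON dict:

In [ ]:
# Reload the raw dump; each entry is a one-element list holding one
# tweet's JSON dict (see extract_json above).
with open(raw_data_file) as jsonfile:
    reloaded_tweets = json.load(jsonfile)
print("Reloaded {0} tweets".format(len(reloaded_tweets)))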

In [27]:
eastern = timezone('US/Eastern')
zurich = timezone('Europe/Zurich')

# Quick daylight-saving hack: shift by one hour
dls = timedelta(seconds=3600)

fmt = '%Y-%m-%d %H:%M:%S %Z%z'
simple_time_format = '%H:%M'
# Used to put all timestamps in one day
one_day_time_format = '2017-01-01 %H:%M:%S'


print("Got {0} tweets".format(len(outtweets)))

dt_series = []

for tweet in outtweets:
    timestamp = tweet[1]
    # print(timestamp)
    loc_dt = zurich.localize(timestamp) + dls
#     print("Localized Zurich Timezone: {0}".format(loc_dt.strftime(fmt)))    
    eastern_dt = loc_dt.astimezone(eastern)
#     print("Localized Eastern Timezone: {0}".format(eastern_dt.strftime(one_day_time_format)))
    
    one_day_dt = datetime.datetime.strptime(eastern_dt.strftime(one_day_time_format), '%Y-%m-%d %H:%M:%S')
    
    dt_series.append(one_day_dt)
    

series = pd.Series(1, index=dt_series)
binned = series.resample('30T').sum()

counted = Counter(dt_series)

df2 = pd.DataFrame({'Date_Time': list(counted.keys()),
                    'Count': list(counted.values())})

binned_df = pd.DataFrame({'Date_Time':binned.index, 'Count':binned.values})

# print(binned.index)
# print(binned.values)

# print(binned_df.Date_Time.dt.time)
binned_df.plot.line(x=binned_df.Date_Time.dt.time, y='Count', marker='o', alpha=0.3, figsize=(20,10))

# print(df2.Date_Time.dt.hour)
df2.plot.line(x=df2.Date_Time.dt.time, y='Count', marker='o', alpha=0.3, figsize=(20,10))

# print(df2.Date_Time.dt.time)
# df2.plot.line(x=df2.Date_Time.dt.time)
# ppd = pd.Series(dt_series)
# print(ppd)


Got 3243 tweets
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x111c0c4a8>
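
Since the Twitter API reports created_at in UTC (naive datetimes in tweepy 3.x), a more robust alternative to the fixed one-hour DST shift above is to localize as UTC and let pytz compute the Eastern offset; a sketch:

In [ ]:
# Sketch: localize created_at as UTC and let pytz apply the correct
# Eastern offset, daylight saving included, instead of the fixed +1h hack.
import pytz

utc_dt = pytz.utc.localize(alltweets[0].created_at)
print(utc_dt.astimezone(eastern).strftime(fmt))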

In [ ]:
list(counted)

In [ ]:
# series = pd.Series(1, index=dt_series)
binned = series.resample('30T').sum()

binned_df = pd.DataFrame({'Date_Time':binned.index, 'Count':binned.values})
print(binned_df)

In [100]:
def get_tweet_by_id(tweet_id):
    # tweet_id rather than id, to avoid shadowing the built-in id()
    return [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8"), tweet._json]
            for tweet in alltweets if tweet.id_str == tweet_id]
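
For repeated lookups, a dict keyed by id_str avoids the linear scan; a small sketch:

In [ ]:
# Sketch: index the tweets once for O(1) lookup by id.
tweets_by_id = {tweet.id_str: tweet for tweet in alltweets}
# tweets_by_id['795954831718498305'].text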

In [101]:
# favorite_count: Number of likes per tweet
favourite_tweets_list = [tweet.favorite_count for tweet in alltweets]
id_tweets_list = [tweet.id_str for tweet in alltweets]

# print(len(favourite_tweets_list))
# print(id_tweets_list)

df_fav = pd.DataFrame({'likes':favourite_tweets_list, 'id':id_tweets_list}, index=id_tweets_list)
print(df_fav.head())
max_likes = df_fav['likes'].max()
print("Most liked tweet got {0} likes".format(max_likes))

# df.loc[df['likes'].idxmax()]
max_likes_index = df_fav['likes'].idxmax()
# print(df_fav.loc[max_likes_index])

# most_liked_tweet = api.user_timeline(screen_name = screen_name, count = _count, max_id = oldest)

most_liked_tweet = get_tweet_by_id(max_likes_index)

print("Average likes {0}".format(df_fav['likes'].mean()))
df_fav.plot(figsize=(20,10))


                                    id   likes
833050081641234435  833050081641234435   69240
832950628750127106  832950628750127106   79521
832945737625387008  832945737625387008  147066
832742165436579840  832742165436579840   86872
832730328108134402  832730328108134402  105603
Most liked tweet got 634311 likes
Average likes 34321.68270120259
Out[101]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ba26ef0>

In [102]:
print(most_liked_tweet[0][1])
print(most_liked_tweet[0][2])


2016-11-09 11:36:58
b'Such a beautiful and important evening! The forgotten man and woman will never be forgotten again. We will all come together as never before'

In [103]:
df_fav.sort_values(by='likes', ascending=False)


Out[103]:
id likes
796315640307060738 796315640307060738 634311
795954831718498305 795954831718498305 573937
823174199036542980 823174199036542980 396734
815185071317676033 815185071317676033 351584
822669114237943808 822669114237943808 294355
741007091947556864 741007091947556864 293634
822421390125043713 822421390125043713 272571
828447350200926212 828447350200926212 271899
827885966509604865 827885966509604865 260054
826774668245946368 826774668245946368 256665
826637556787838976 826637556787838976 255100
755788382618390529 755788382618390529 246242
829836231802515457 829836231802515457 239413
796900183955095552 796900183955095552 232033
825721153142521858 825721153142521858 227870
810996052241293312 810996052241293312 223977
825692045532618753 825692045532618753 222839
797034721075228672 797034721075228672 222160
822502270503972872 822502270503972872 221831
823151124815507460 823151124815507460 221293
823150055418920960 823150055418920960 217161
827112633224544256 827112633224544256 215658
803567993036754944 803567993036754944 214494
824080766288228352 824080766288228352 213355
813079058896535552 813079058896535552 212948
802499192237080576 802499192237080576 210375
830552079240409089 830552079240409089 208847
827655062835052544 827655062835052544 208181
797455295928791040 797455295928791040 202151
824083821889015809 824083821889015809 197791
... ... ...
795120273070686208 795120273070686208 0
760550754692263936 760550754692263936 0
785293820258902016 785293820258902016 0
779348578040963073 779348578040963073 0
780585254583169078 780585254583169078 0
743827444960899073 743827444960899073 0
783488429442994176 783488429442994176 0
783485558202916865 783485558202916865 0
780444199737036800 780444199737036800 0
783481871032123392 783481871032123392 0
780550185365737472 780550185365737472 0
780576225848885249 780576225848885249 0
783481080078749696 783481080078749696 0
783480864395055104 783480864395055104 0
794022452233781248 794022452233781248 0
780583741051768832 780583741051768832 0
785287884718895104 785287884718895104 0
780592630585499648 780592630585499648 0
780590406501150723 780590406501150723 0
788479634694246400 788479634694246400 0
788449842854895616 788449842854895616 0
760289166324215808 760289166324215808 0
785289956541554689 785289956541554689 0
785292936095424512 785292936095424512 0
788077766063390724 788077766063390724 0
788048800766099456 788048800766099456 0
715215099082878977 715215099082878977 0
788036646096822272 788036646096822272 0
794320826237644805 794320826237644805 0
778655495037091845 778655495037091845 0

3243 rows × 2 columns
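
The same top rows can be pulled without sorting the whole frame via pandas' nlargest; a one-line sketch:

In [ ]:
# Sketch: top-10 most-liked tweets without a full sort.
df_fav.nlargest(10, 'likes')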


In [106]:
second_highest = get_tweet_by_id('795954831718498305')
print(second_highest[0][1])
print(second_highest[0][2])


2016-11-08 11:43:14
b'TODAY WE MAKE AMERICA GREAT AGAIN!'
