In [4]:
# inspired by https://gist.github.com/yanofsky/5436496
'''
@author Manuel Kohler
'''
%matplotlib inline
import csv
import datetime
import json
import os
from collections import Counter
from datetime import timedelta

import matplotlib
import numpy as np
import pandas as pd
import tweepy
from pytz import timezone
from tweepy import OAuthHandler, Stream
from tweepy.streaming import StreamListener

matplotlib.style.use('ggplot')
screen_name = 'realDonaldTrump'
#screen_name = 'rogerfederer'
_count = 200
base_dir = '/Users/kohleman/PycharmProjects/twitter/twitter_api/twitter_api/data/'
raw_data_file = os.path.join(base_dir, 'raw_tweets_{0}.json'.format(screen_name))
# Login to API
# ckey, csecrect, atoken and asecret are assumed to be defined in a separate credentials cell (not shown here)
auth = OAuthHandler(ckey, csecrect)
auth.set_access_token(atoken, asecret)
api = tweepy.API(auth)
class listener(StreamListener):

    def on_data(self, raw_data):
        print(raw_data)
        return True

    def on_error(self, status_code):
        print(status_code)


def extract_json(raw_tweets):
    # Keep each tweet's raw JSON payload, wrapped in a one-element list
    return [[tweet._json] for tweet in raw_tweets]
In [5]:
# Access current stream
twitterStream = Stream(auth, listener())
#twitterStream.filter(track=["#trump"])
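The listener above just echoes the stream to stdout. As a variation, here is a minimal sketch of a listener that appends each raw tweet to disk instead; the `stream_file` path is hypothetical and not defined elsewhere in this notebook:
In [ ]:
# Sketch only: persist the raw stream instead of printing it.
# `stream_file` is a hypothetical path used for illustration.
class FileListener(StreamListener):

    def on_data(self, raw_data):
        with open(stream_file, 'a') as f:
            f.write(raw_data)
        return True

    def on_error(self, status_code):
        print(status_code)
        return False  # stop the stream on error

# fileStream = Stream(auth, FileListener())
# fileStream.filter(track=["#trump"])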
In [6]:
def get_all_tweets(screen_name):
    # adapted from https://github.com/adilmoujahid/Twitter_Analytics/blob/master/analyze_tweets.py
    # Twitter only allows access to a user's most recent 3240 tweets with this method

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=_count)

    # save most recent tweets
    alltweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before {0}".format(oldest))

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=_count, max_id=oldest)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        print("...{0} tweets downloaded so far".format(len(alltweets)))

    # transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8")] for tweet in alltweets]

    # write the csv
    with open(os.path.join(base_dir, '{0}_all_tweets.csv'.format(screen_name)), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "text"])
        writer.writerows(outtweets)

    return alltweets
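tweepy also ships its own pagination helper, so the manual max_id loop above could be replaced with tweepy.Cursor; a minimal sketch (the same 3240-tweet API ceiling applies):
In [ ]:
# Sketch: tweepy.Cursor does the max_id bookkeeping internally.
def get_all_tweets_cursor(screen_name):
    return [status for status in
            tweepy.Cursor(api.user_timeline, screen_name=screen_name, count=_count).items()]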
In [8]:
alltweets = get_all_tweets(screen_name)
In [9]:
# Historic tweets
#http://trumptwitterarchive.com/data/realdonaldtrump/2009.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2010.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2011.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2012.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2013.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2014.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2015.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2016.json
#http://trumptwitterarchive.com/data/realdonaldtrump/2017.json
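If the pre-2017 history is needed, the yearly archive files above could be fetched and merged roughly like this. This is a sketch only: it assumes the requests package is installed and that each archive URL still serves a plain JSON list of tweets.
In [ ]:
# Sketch: pull all yearly archive files into one list of tweet dicts.
import requests

archive_url = 'http://trumptwitterarchive.com/data/realdonaldtrump/{0}.json'
historic_tweets = []
for year in range(2009, 2018):
    response = requests.get(archive_url.format(year))
    response.raise_for_status()
    # assumed format: a JSON array of tweet objects per year
    historic_tweets.extend(response.json())
print("Fetched {0} historic tweets".format(len(historic_tweets)))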
In [11]:
# Accessing https://dev.twitter.com/rest/reference/get/statuses/user_timeline
# include_rts = include retweets
#new_tweets = api.user_timeline(screen_name = screen_name, count=count, include_rts=True)
# max_id_tweets = api.user_timeline(screen_name = screen_name,count=count, include_rts=True, max_id=818643528905621504)
outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8"), tweet._json] for tweet in alltweets]
json_tweets = extract_json(alltweets)
# older_tweets = extract_json(max_id_tweets)
with open(raw_data_file, 'w') as jsonfile:
    json.dump(json_tweets, jsonfile, ensure_ascii=False, indent=4, separators=(',', ': '))
    # json.dump(older_tweets, jsonfile, ensure_ascii=False, indent=4, separators=(',', ': '))
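Since extract_json wraps every tweet in a one-element list, reading the dump back means unwrapping one level; a sketch of the round trip:
In [ ]:
# Sketch: reload the raw JSON dump written above.
with open(raw_data_file) as jsonfile:
    reloaded = json.load(jsonfile)
# extract_json stored each tweet as [tweet_dict], so unwrap the outer list.
reloaded_tweets = [entry[0] for entry in reloaded]
print("Reloaded {0} tweets".format(len(reloaded_tweets)))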
In [27]:
eastern = timezone('US/Eastern')
zurich = timezone('Europe/Zurich')
# Quick daylight-saving hack: shift everything by a fixed hour
dls = timedelta(seconds=3600)
fmt = '%Y-%m-%d %H:%M:%S %Z%z'
simple_time_format = '%H:%M'
# Used to project all timestamps onto a single day
one_day_time_format = '2017-01-01 %H:%M:%S'
print("Got {0} tweets".format(len(outtweets)))
dt_series = []
for tweet in outtweets:
    timestamp = tweet[1]
    # print(timestamp)
    loc_dt = zurich.localize(timestamp) + dls
    # print("Localized Zurich Timezone: {0}".format(loc_dt.strftime(fmt)))
    eastern_dt = loc_dt.astimezone(eastern)
    # print("Localized Eastern Timezone: {0}".format(eastern_dt.strftime(one_day_time_format)))
    one_day_dt = datetime.datetime.strptime(eastern_dt.strftime(one_day_time_format), '%Y-%m-%d %H:%M:%S')
    dt_series.append(one_day_dt)
series = pd.Series(1, index=dt_series)
binned = series.resample('30T').sum()
counted = Counter(dt_series)
values = []
for key, value in dict(counted).items():
    values.append(value)
df2 = pd.DataFrame({'Date_Time': list(counted), 'Count': values})
binned_df = pd.DataFrame({'Date_Time':binned.index, 'Count':binned.values})
# print(binned.index)
# print(binned.values)
# print(binned_df.Date_Time.dt.time)
binned_df.plot.line(x=binned_df.Date_Time.dt.time, y='Count', marker='o', alpha=0.3, figsize=(20,10))
# print(df2.Date_Time.dt.hour)
df2.plot.line(x=df2.Date_Time.dt.time, y='Count', marker='o', alpha=0.3, figsize=(20,10))
# print(df2.Date_Time.dt.time)
# df2.plot.line(x=df2.Date_Time.dt.time)
# ppd = pd.Series(dt_series)
# print(ppd)
Out[27]:
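The fixed one-hour dls offset above is only an approximation of daylight saving time. pandas can do the Zurich-to-Eastern conversion DST-correctly; a sketch over the same timestamps, keeping the notebook's assumption that the raw created_at values represent Zurich wall-clock time:
In [ ]:
# Sketch: DST-aware alternative to the manual `dls` offset.
# Note: tz_localize can raise on ambiguous times around DST transitions.
ts = pd.Series(pd.to_datetime([t[1] for t in outtweets]))
eastern_ts = ts.dt.tz_localize('Europe/Zurich').dt.tz_convert('US/Eastern')
print(eastern_ts.head())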
In [ ]:
list(counted)
In [ ]:
# series = pd.Series(1, index=dt_series)
binned = series.resample('30T').sum()
binned_df = pd.DataFrame({'Date_Time':binned.index, 'Count':binned.values})
print(binned_df)
In [100]:
def get_tweet_by_id(tweet_id):
    # named tweet_id rather than id, to avoid shadowing the builtin
    return [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8"), tweet._json]
            for tweet in alltweets if tweet.id_str == tweet_id]
In [101]:
# favorite_count: Number of likes per tweet
favourite_tweets_list = [tweet.favorite_count for tweet in alltweets]
id_tweets_list = [tweet.id_str for tweet in alltweets]
# print(len(favourite_tweets_list))
# print(id_tweets_list)
df_fav = pd.DataFrame({'likes':favourite_tweets_list, 'id':id_tweets_list}, index=id_tweets_list)
print(df_fav.head())
max_likes = df_fav['likes'].max()
print("Most liked tweet got {0} likes".format(max_likes))
# df.loc[df['likes'].idxmax()]
max_likes_index = df_fav['likes'].idxmax()
# print(df_fav.loc[max_likes_index])
# most_liked_tweet = api.user_timeline(screen_name = screen_name, count = _count, max_id = oldest)
most_liked_tweet = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8"), tweet._json] for tweet in alltweets if tweet.id_str == max_likes_index]
print("Average likes {0}".format(df_fav['likes'].mean()))
df_fav.plot(figsize=(20,10))
Out[101]:
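pandas can also rank the likes directly, which avoids the idxmax round trip; for example, the five most liked tweets:
In [ ]:
# Sketch: top five tweets by like count.
print(df_fav.nlargest(5, 'likes'))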
In [102]:
print(most_liked_tweet[0][1])
print(most_liked_tweet[0][2])
In [103]:
df_fav.sort_values(by='likes', ascending=False)
Out[103]:
In [106]:
second_highest = get_tweet_by_id('795954831718498305')
print(second_highest[0][1])
print(second_highest[0][2])