In [18]:
import tweepy
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
# Import keys
from credentials import *
In [3]:
# Setup Twitter API connection
def twitter_setup():
    """Build a tweepy API client authenticated with OAuth credentials.

    Uses CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN and ACCESS_SECRET from
    the notebook namespace (presumably supplied by the star-import from
    `credentials` -- verify).

    Returns:
        The `tweepy.API` object built from the configured OAuth handler.
    """
    oauth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    oauth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    return tweepy.API(oauth)
In [183]:
# Extract Twitter data: fetch up to 200 tweets from the account's timeline
extractor = twitter_setup()
tweets = extractor.user_timeline(screen_name='realdonaldtrump', count=200)
print('Number of tweets extracted: {}.\n'.format(len(tweets)))

# Display the first 5 tweets of the result. The slice now matches the
# heading; it previously printed 10 tweets under a "5 recent tweets" title.
print('5 recent tweets:\n')
for idx, tweet in enumerate(tweets[:5], start=1):
    print("%d.) %s\n" % (idx, tweet.text))
In [ ]:
# Count the extracted tweets whose text starts with 'RT' (presumably
# retweets). The original line was an unfinished `if` statement, which is a
# SyntaxError and stops a Restart & Run All.
sum(tweet.text.startswith('RT') for tweet in tweets)
In [184]:
# Build a one-column DataFrame holding the text of every extracted tweet,
# then preview the first 10 rows
tweet_texts = [status.text for status in tweets]
df = pd.DataFrame(data=tweet_texts, columns=['Tweets'])
df.head(10)
Out[184]:
In [6]:
# List every attribute and method name available on the first tweet object
latest_tweet = tweets[0]
print(dir(latest_tweet))
In [185]:
# Explore most recent tweet and its properties: print the selected
# attributes of the first tweet, one value per line
for attribute in ('id', 'created_at', 'source', 'favorite_count', 'favorited',
                  'retweet_count', 'retweeted', 'geo', 'coordinates', 'entities'):
    print(getattr(tweets[0], attribute))
In [186]:
# Add columns to df: one value per tweet, in the same order as `tweets`
df['len'] = [len(tweet.text) for tweet in tweets]
df['ID'] = [tweet.id for tweet in tweets]
# Store timestamps as timezone-naive UTC. Naive values are taken to be UTC
# (which is what the later tz_localize('GMT') cells assume) and tz-aware
# values are converted to UTC first, so those later cells work with either
# kind of datetime coming back from the client.
df['Date'] = pd.to_datetime([tweet.created_at for tweet in tweets], utc=True).tz_localize(None)
df['Source'] = [tweet.source for tweet in tweets]
df['Likes'] = [tweet.favorite_count for tweet in tweets]
df['Retweets'] = [tweet.retweet_count for tweet in tweets]
In [187]:
# Show the first 5 records (head() is called without n, so it uses its default of 5)
df.head()
Out[187]:
In [188]:
# Show datatypes, non-null counts and deep memory usage of df.
# `show_counts` replaces the `null_counts` keyword, which newer pandas
# releases no longer accept.
df.info(show_counts=True, memory_usage='deep')
In [189]:
# Descriptive summary statistics of df
df.describe()
Out[189]:
In [190]:
# Rank tweets by likes, then by length (both descending), and show the top 5
ranked = df.sort_values(by=['Likes', 'len'], ascending=False)
ranked.head()
Out[190]:
In [191]:
# Explore likes histogram: how many tweets fall into each range of like counts.
# The histogram is taken over the like counts themselves; histogramming
# value_counts() would instead plot how often each exact like count repeats,
# which is not what the title and axis labels describe.
ax = df.Likes.hist()
ax.set_title('Number of Tweets vs. Number of Likes')
ax.set_xlabel('No. of Likes')   # x-axis: like-count bins
ax.set_ylabel('No. of Tweets')  # y-axis: tweets per bin
Out[191]:
In [194]:
# Which tweets received zero likes? Show the last five of them in frame order.
zero_like_mask = df['Likes'] == 0
df[zero_like_mask].tail()
Out[194]:
In [195]:
# Group by day: number of tweets per (month, day-of-month) pair
month_key = df['Date'].dt.month
day_key = df['Date'].dt.day
df.groupby([month_key, day_key])['Tweets'].count()
Out[195]:
In [196]:
# key step! Use the tweet timestamps as the index while keeping the 'Date'
# column, so cells that go through df.index get datetime accessors.
df = df.set_index('Date', drop=False)
In [217]:
# Tweets per hour of day as a horizontal bar chart. The index is localized
# as GMT and converted to US/Eastern before taking the hour.
# NOTE(review): the original note asks which timezone the raw timestamps are
# in -- confirm they really are UTC/GMT.
eastern_hours = df.index.tz_localize('GMT').tz_convert('US/Eastern').hour
df.groupby(eastern_hours)['Tweets'].count().plot(kind='barh')
Out[217]:
In [215]:
# Number of tweets in each US/Eastern hour of the day (index localized as GMT first)
hour_of_day = df.index.tz_localize('GMT').tz_convert('US/Eastern').hour
df.groupby(hour_of_day)['Tweets'].count()
Out[215]:
In [ ]:
# Largest number of tweets posted in any single US/Eastern hour of the day.
# max must be called -- the bare attribute only displays the bound method.
df.groupby(df.index.tz_localize('GMT').tz_convert('US/Eastern').hour).count().Tweets.max()
In [ ]:
# Largest number of tweets posted in any single US/Eastern hour of the day.
# max must be called -- the bare attribute only displays the bound method.
df.groupby(df.index.tz_localize('GMT').tz_convert('US/Eastern').hour).count().Tweets.max()
In [150]:
# 12-hour clock labels for the 24 hours of a day, starting at midnight ('12 AM')
xticks_12 = [
    '{} {}'.format(hour, meridiem)
    for meridiem in ('AM', 'PM')
    for hour in [12] + list(range(1, 12))
]
In [174]:
# Show the index timestamps localized as GMT, then converted to US/Eastern
gmt_index = df.index.tz_localize('GMT')
gmt_index.tz_convert('US/Eastern')
Out[174]:
In [175]:
# Time format x-axis to 12-o'clock time: one '%I %p' label per hour of the day.
# A fixed, timezone-naive reference day guarantees exactly 24 labels (a
# tz-aware range over "today" can gain or lose an hour on a DST changeover
# day). DatetimeIndex.strftime replaces the pd.datetime alias, which newer
# pandas releases no longer provide, and the Timedelta frequency avoids the
# deprecated 'H' alias.
xticks = pd.date_range('2000-01-01', periods=24, freq=pd.Timedelta(hours=1)).strftime('%I %p')
xticks
Out[175]:
In [142]:
# IPython help lookup for pd.date_range (interactive aid; the trailing `?` is IPython syntax, not plain Python)
pd.date_range?
In [176]:
# Tweets per hour of day in US/Eastern time (index localized as GMT, then converted).
eastern_hours = df.index.tz_localize('GMT').tz_convert('US/Eastern').hour
# Reindex to all 24 hours so bar i is always hour i. Without this, hours that
# have no tweets are missing from the result and the 24 tick labels set below
# end up on the wrong bars.
tweets_per_hour = df.groupby(eastern_hours).count().Tweets.reindex(range(24), fill_value=0)
ax = tweets_per_hour.plot(kind='bar')
ax.set_xticks(np.arange(24))
# `xticks` holds the 24 hour labels built in the earlier label cell
ax.set_xticklabels(xticks, rotation=45)
ax.set_title('Number of Tweets vs. Time')
ax.set_xlabel('Time (h)')
ax.set_ylabel('No. of Tweets')
Out[176]:
In [ ]:
In [137]:
# Tweets per weekday, busiest day first. day_name() replaces the
# weekday_name attribute, which newer pandas releases no longer provide.
# Weekdays are taken from the index as-is (no timezone conversion).
df.groupby(df.index.day_name()).count().Tweets.sort_values(ascending=False)
Out[137]:
In [48]:
Out[48]:
In [ ]:
# Plot tweets per hour of day (hours taken from the index as-is, no timezone
# conversion). plot must be called -- the bare attribute draws nothing.
df.groupby(df.index.hour).count().Tweets.plot()
In [196]:
# Compute metrics: average tweet length, taken over the 'len' column
mean = df['len'].mean()
print('Tweet length average is: {}\n'.format(mean))
In [197]:
# Get most-liked and retweeted tweet
fav_max = np.max(df.Likes)
rt_max = np.max(df.Retweets)
# Take the first matching row by position. Looking the tweet up again by
# index label can return several rows when the index is not unique (e.g.
# once it holds tweet timestamps), which would break the prints below.
fav_row = df[df.Likes == fav_max].iloc[0]
rt_row = df[df.Retweets == rt_max].iloc[0]
fav_idx = fav_row.name  # index label of the most-liked tweet
rt_idx = rt_row.name    # index label of the most-retweeted tweet

# Max favorite tweet:
print("Most liked tweet is:")
print("%s" % fav_row['Tweets'])
print("Number of likes: {}".format(fav_max))
print("{} characters.\n".format(fav_row['len']))

# Max retweeted tweet (the count line previously said "likes" here):
print("Most retweeted tweet is:")
print("%s" % rt_row['Tweets'])
print("Number of retweets: {}".format(rt_max))
print("{} characters.\n".format(rt_row['len']))
In [13]:
# Construct pandas Series objects: tweet length, likes and retweets, each
# indexed by the tweet's 'Date' value
time_length, time_favorite, time_retweet = (
    pd.Series(df[column].values, index=df['Date'])
    for column in ('len', 'Likes', 'Retweets')
)
In [14]:
# Line plot of tweet length over time (red line, wide figure)
ax = time_length.plot(title='Tweet length vs. DateTime', color='r', figsize=(16, 4))
ax.set_xlabel(xlabel="DateTime")
ax.set_ylabel(ylabel="Tweet length")
Out[14]:
In [116]:
# Plot Likes and retweets over time, both on the same axes.
# The y-axis is labelled in thousands ("k"), so the raw counts are divided by
# 1000 before plotting; previously raw counts were drawn under that label.
ax = (time_favorite / 1000).plot(figsize=(16, 4), label="Likes", legend=True,
                                 title="Likes & Retweets vs. DateTime")
# Pass ax explicitly so the second line is drawn on the first plot's axes
# rather than on whatever the current pyplot axes happen to be.
(time_retweet / 1000).plot(ax=ax, label="Retweets", legend=True)
ax.set_xlabel("DateTime")
ax.set_ylabel("Total (k)")
Out[116]:
In [16]:
# Get sources: distinct values of the 'Source' column, in order of first
# appearance. unique() replaces the manual "append if not already in list" loop.
sources = list(df['Source'].unique())

# Print sources
print("Tweet created using:")
for source in sources:
    print("* {}".format(source))
In [17]:
# Create a pie chart of sources: share of tweets per source, in percent.
# Count tweets per source, ordered like `sources` (0 for a listed source
# that does not occur in df).
source_counts = df['Source'].value_counts().reindex(sources, fill_value=0)
# Divide by the actual number of counted tweets. The previous hard-coded
# division by 100 only gave percentages when exactly 100 tweets were loaded.
percent = (source_counts / source_counts.sum() * 100).values
pie_chart = pd.Series(percent, index=sources, name='Platform sources')
pie_chart.plot.pie(fontsize=11, autopct='%.2f', figsize=(6, 6))
Out[17]: