In [1]:
# General:
import tweepy # To consume Twitter's API
import pandas as pd # To handle data
import numpy as np # For number computing
# For plotting and visualization:
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
# We import our access keys:
from credentials import * # This will allow us to use the keys as variables
# API setup:
def twitter_setup():
    """
    Utility function to set up the Twitter API
    with our access keys.
    """
    # Authentication and access using keys:
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    # Return the API object with authentication:
    api = tweepy.API(auth)
    return api
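For reference, the credentials module imported above is assumed to be a local credentials.py file along these lines (a hypothetical sketch; the four variable names match the ones used in twitter_setup, and the values come from your own app at https://developer.twitter.com):

# credentials.py -- hypothetical example; substitute your own keys
CONSUMER_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXX"
CONSUMER_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXX"
ACCESS_TOKEN = "XXXXXXXXXXXXXXXXXXXXXXXXX"
ACCESS_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXX"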
In [8]:
# We create an extractor object:
extractor = twitter_setup()
# We create a tweet list as follows:
tweets = []
for tweet in tweepy.Cursor(extractor.user_timeline, screen_name="ttcnotices",
                           exclude_replies=True, include_rts=False).items(500):
    tweets.append(tweet)
print("Number of tweets extracted: {}.\n".format(len(tweets)))
# We print the 5 most recent tweets:
print("5 recent tweets:\n")
for tweet in tweets[:5]:
    print(tweet.text)
    print()
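Note that user_timeline returns tweet text truncated to 140 characters by default. If the full text matters for the analysis, one option (a sketch, not used in the run above) is to request extended mode and read full_text instead of text:

# Hypothetical variant: fetch untruncated tweets via extended mode.
full_tweets = []
for tweet in tweepy.Cursor(extractor.user_timeline, screen_name="ttcnotices",
                           exclude_replies=True, include_rts=False,
                           tweet_mode="extended").items(500):
    full_tweets.append(tweet.full_text)  # extended mode exposes full_text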
In [19]:
# We create a pandas dataframe as follows:
data = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])
# We display the first 10 elements of the dataframe:
display(data.head(10))
In [20]:
# Attributes and methods of a single tweet object:
print(dir(tweets[0]))
In [21]:
# We print info from the first tweet:
print(tweets[0].id)
print(tweets[0].created_at)
print(tweets[0].source)
print(tweets[0].favorite_count)
print(tweets[0].retweet_count)
print(tweets[0].geo)
print(tweets[0].coordinates)
print(tweets[0].entities)
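Each tweepy Status object also carries the raw JSON payload returned by the API, which is often easier to explore than the dir() listing above. A quick sketch:

# The raw API payload is a plain dict on the Status object:
raw = tweets[0]._json
print(sorted(raw.keys()))          # every field Twitter returned
print(raw["user"]["screen_name"])  # drill into nested fields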
In [22]:
# We add relevant data:
data['len'] = np.array([len(tweet.text) for tweet in tweets])
data['ID'] = np.array([tweet.id for tweet in tweets])
data['Date'] = np.array([tweet.created_at for tweet in tweets])
data['Source'] = np.array([tweet.source for tweet in tweets])
data['Likes'] = np.array([tweet.favorite_count for tweet in tweets])
data['RTs'] = np.array([tweet.retweet_count for tweet in tweets])
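Equivalently, the whole dataframe can be built in a single pass instead of appending columns one by one; a minimal sketch (not run above):

# Hypothetical one-pass construction of the same dataframe:
data = pd.DataFrame({
    'Tweets': [tweet.text for tweet in tweets],
    'len': [len(tweet.text) for tweet in tweets],
    'ID': [tweet.id for tweet in tweets],
    'Date': [tweet.created_at for tweet in tweets],
    'Source': [tweet.source for tweet in tweets],
    'Likes': [tweet.favorite_count for tweet in tweets],
    'RTs': [tweet.retweet_count for tweet in tweets],
})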
In [23]:
# We display the first 10 elements of the dataframe:
display(data.head(10))
In [24]:
# We compute the mean of the tweet lengths:
mean = np.mean(data['len'])
print("The average tweet length: {:.2f} characters".format(mean))
In [25]:
# We find the tweets with the most likes (FAVs) and the most retweets (RTs):
fav_max = np.max(data['Likes'])
rt_max = np.max(data['RTs'])
fav = data[data.Likes == fav_max].index[0]
rt = data[data.RTs == rt_max].index[0]
# Max FAVs:
print("The tweet with more likes is: \n{}".format(data['Tweets'][fav]))
print("Number of likes: {}".format(fav_max))
print("{} characters.\n".format(data['len'][fav]))
# Max RTs:
print("The tweet with more retweets is: \n{}".format(data['Tweets'][rt]))
print("Number of retweets: {}".format(rt_max))
print("{} characters.\n".format(data['len'][rt]))
In [26]:
# We create time series for data:
tlen = pd.Series(data=data['len'].values, index=data['Date'])
tfav = pd.Series(data=data['Likes'].values, index=data['Date'])
tret = pd.Series(data=data['RTs'].values, index=data['Date'])
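Because the series are indexed by tweet timestamp, they can also be resampled to smooth the plots below; for example, a daily mean (a sketch, with an explicit conversion in case the index is not already datetime-typed):

# Hypothetical smoothing: average per calendar day.
tlen.index = pd.to_datetime(tlen.index)  # ensure a DatetimeIndex
tlen_daily = tlen.resample('D').mean()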
In [27]:
# Tweet length over time:
tlen.plot(figsize=(16, 4), color='r')
Out[27]: [line plot of tweet length over time]
In [28]:
# Likes vs retweets visualization:
tfav.plot(figsize=(16, 4), label="Likes", legend=True)
tret.plot(figsize=(16, 4), label="Retweets", legend=True)
Out[28]: [overlaid line plots of likes and retweets over time]
In [29]:
# We obtain all possible sources:
sources = []
for source in data['Source']:
    if source not in sources:
        sources.append(source)
# We print sources list:
print("Creation of content sources:")
for source in sources:
print("* {}".format(source))
In [30]:
# We create a numpy vector of counts mapped to the source labels:
percent = np.zeros(len(sources))
for source in data['Source']:
    for index in range(len(sources)):
        if source == sources[index]:
            percent[index] += 1
            break
# Convert counts to percentages of the total:
percent = 100 * percent / len(data['Source'])
# Pie chart:
pie_chart = pd.Series(percent, index=sources, name='Sources')
pie_chart.plot.pie(fontsize=11, autopct='%.2f', figsize=(6, 6))
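A more idiomatic route to the same chart is value_counts, which tallies the sources in one call; autopct then renders the percentages, so no manual normalization is needed. A sketch:

# Equivalent pie chart straight from value_counts:
data['Source'].value_counts().plot.pie(fontsize=11, autopct='%.2f', figsize=(6, 6))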
Out[30]: [pie chart of tweet sources]
In [31]:
from textblob import TextBlob
import re
def clean_tweet(tweet):
    '''
    Utility function to clean the text of a tweet by removing
    links and special characters using a regex.
    '''
    return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def analyze_sentiment(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using TextBlob.
    '''
    analysis = TextBlob(clean_tweet(tweet))
    if analysis.sentiment.polarity > 0:
        return 1
    elif analysis.sentiment.polarity == 0:
        return 0
    else:
        return -1
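A quick sanity check of the two helpers on made-up strings (hypothetical examples, not from the dataset):

# Expected: cleaned text, then polarities 1, 0, -1 (approximately;
# TextBlob's scores depend on its lexicon).
print(clean_tweet("@TTChelps thanks! https://t.co/abc123 service is great"))
print(analyze_sentiment("Great service today"))    # positive -> 1
print(analyze_sentiment("Route 501 at Queen St"))  # neutral  -> 0
print(analyze_sentiment("Terrible delays again"))  # negative -> -1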
In [32]:
# We create a column with the result of the analysis:
data['SA'] = np.array([analyze_sentiment(tweet) for tweet in data['Tweets']])
# We display the updated dataframe with the new column:
display(data.head(10))
In [33]:
# We construct lists of classified tweets:
pos_tweets = [tweet for index, tweet in enumerate(data['Tweets']) if data['SA'][index] > 0]
neu_tweets = [tweet for index, tweet in enumerate(data['Tweets']) if data['SA'][index] == 0]
neg_tweets = [tweet for index, tweet in enumerate(data['Tweets']) if data['SA'][index] < 0]
In [34]:
# We print the percentages:
print("Percentage of positive tweets: {:.2f}%".format(len(pos_tweets) * 100 / len(data['Tweets'])))
print("Percentage of neutral tweets: {:.2f}%".format(len(neu_tweets) * 100 / len(data['Tweets'])))
print("Percentage of negative tweets: {:.2f}%".format(len(neg_tweets) * 100 / len(data['Tweets'])))