In [1]:
%run helper_functions.py
%run filters.py # contains the filtration code used in this notebook
%run plotly_functions.py # wrappers around plot.ly
from datetime import date
from tabulate import tabulate
from collections import Counter
from IPython.display import Image
import math
import string
%matplotlib inline
plt.rcParams["figure.figsize"] = (15,20)
plt.rcParams["xtick.labelsize"] = 16
plt.rcParams["ytick.labelsize"] = 16
plt.rcParams["axes.labelsize"] = 20
plt.rcParams['legend.fontsize'] = 20
plt.style.use('fivethirtyeight')
pd.set_option('display.max_colwidth', -1)
import plotly.plotly as py
import plotly.graph_objs as go
In [2]:
df = unpickle_object("no_duplicates_df.pkl")
In [3]:
df.shape
Out[3]:
In [4]:
df.head()
Out[4]:
In [5]:
percentage_missing(df) # it seems that only dates and times are missing from our data, and only 0.2%!
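percentage_missing comes from helper_functions.py and is not shown here. A minimal sketch of what such a helper might compute (hypothetical; the real implementation may differ):
def percentage_missing_sketch(frame):
    # hypothetical stand-in for percentage_missing: the share of
    # missing values per column, expressed as a percentage
    return (frame.isnull().sum() / len(frame) * 100).round(2)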
In [6]:
df[df["date"].isnull()].shape
Out[6]:
In [7]:
df[df["time"].isnull()].shape
Out[7]:
In [8]:
all(df[df["time"].isnull()].index == df[df["date"].isnull()].index) # perfect match for indices!
Out[8]:
In [9]:
df[df["date"].isnull()].iloc[-1, :]
Out[9]:
The last row with a missing date has an index of 1049876.
In [10]:
df[df["date"].isnull()].head()
Out[10]:
The first row with a missing date has an index of 1047747.
In [11]:
1049876 - 1047747 # the missing indices span a range of 2129 positions,
# which is larger than the 1286 rows actually missing,
# so we know that the missing values are not consecutive!
Out[11]:
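The same conclusion can be checked directly against the index (a small sketch):
missing_idx = df[df["date"].isnull()].index
# if the missing rows were consecutive, the index span would equal the row count
print((missing_idx.max() - missing_idx.min() + 1) == len(missing_idx))  # False -> not consecutive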
In [12]:
#these are the handles that have missing dates/times
list(set(df[df["date"].isnull()]['handle']))
Out[12]:
Having sampled a large number of the handles above, I found that most of these accounts were bots, or had been suspended (meaning they were likely former bots). As such, I will simply drop all rows that are missing date and time.
In [13]:
df.dropna(inplace=True)
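Note that a bare dropna drops any row with a missing value in any column. We verified above that only date and time contain nulls, so it is safe here, but restricting the subset would be the more defensive equivalent:
df.dropna(subset=["date", "time"], inplace=True)  # only drop rows missing date/time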
In [14]:
df.shape
Out[14]:
In [15]:
1286 + 610694 # 1286 dropped + 610694 remaining = the original row count, so we dropped exactly the missing rows!
Out[15]:
We still have rows with a date of 1970! Since 1970-01-01 is the Unix epoch, these dates almost certainly come from failed timestamp parsing, and the tweets themselves must be nonsensical. We will drop these too!
In [16]:
df[df['date'] == date(1970,1,1)] #clearly bad rows of data!
Out[16]:
In [17]:
to_drop = df[df['date'] == date(1970,1,1)].index
In [18]:
to_drop
Out[18]:
In [19]:
df.loc[to_drop, :]
Out[19]:
In [20]:
df.drop(to_drop, inplace=True)
In [21]:
df.shape
Out[21]:
In [22]:
610694 - 7 # we dropped the right amount!
Out[22]:
In [23]:
df.head()
Out[23]:
Let's now clean up our tweets further! This ensures we don't carry garbage hashtags or nonsensical words into the lemmatization process later on.
To make sure duplicates are fully removed, I will again drop duplicates, this time based on the clean_tweet_V2 column.
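filtration_1 and filtration_2 live in filters.py and are not shown in this notebook. Purely as an illustration of the kind of pass such a function might perform (hypothetical; the real implementations may differ):
import re

def filtration_sketch(frame, src_col, dst_col):
    # hypothetical filtration pass: strip URLs and @mentions,
    # lowercase, and collapse whitespace
    def scrub(text):
        text = re.sub(r"https?://\S+", "", text)  # remove URLs
        text = re.sub(r"@\w+", "", text)          # remove @mentions
        return " ".join(text.lower().split())     # normalise whitespace
    frame = frame.copy()
    frame[dst_col] = frame[src_col].apply(scrub)
    return frame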
In [24]:
clean_df = filtration_1(df,"clean_tweet_V1", "clean_tweet_V2")
In [25]:
clean_df.head()
Out[25]:
In [26]:
clean_df = filtration_2(clean_df, "clean_tweet_V2")
In [27]:
clean_df.head()
Out[27]:
In [28]:
clean_df.shape
Out[28]:
In [29]:
clean_df.drop_duplicates(subset="clean_tweet_V2", inplace=True)
In [30]:
clean_df.shape #lost around 80K rows! so many duplicates!
Out[30]:
It seems that our handle column does not strictly contain Twitter user names. Rather, some entries contain tweet text! This is likely the result of bots, so we will remove these entries from our dataset.
Also, if a handle contains the word "bot", we will remove it as well (a sketch of that rule follows below).
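The first-character filter is implemented a few cells below; the "bot" rule is not shown explicitly, so here is a hypothetical sketch of it (note that a bare substring match would also catch legitimate handles that merely contain "bot"):
bot_mask = clean_df['handle'].str.lower().str.contains("bot", na=False)  # hypothetical rule
clean_df = clean_df[~bot_mask]  # keep only handles without "bot"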
In [31]:
clean_df.sort_values(by="handle").head(50) # as we can see from the sample, it's all nonsense tweets
Out[31]:
In [32]:
clean_df.sort_values(by='handle', inplace=True) # let's prep our dataframe for the cleaning process
clean_df.reset_index(inplace=True)
del clean_df['index']
clean_df.head()
Out[32]:
In [33]:
clean_df.shape
Out[33]:
In [34]:
to_drop = []
for index in clean_df.index:
    if clean_df.iloc[index, 1][0] == "_":  # some users have _ at the start of their name
        continue
    if not clean_df.iloc[index, 1][0].isalnum():
        to_drop.append(index)
In [35]:
len(to_drop) #we need to drop 212 records
Out[35]:
In [36]:
for handle in clean_df.iloc[to_drop, :]['handle']:
    print(handle)
    print()
In [37]:
clean_df.drop(to_drop, inplace=True)
In [38]:
clean_df.head()
Out[38]:
In [39]:
clean_df.drop([190], inplace=True) # drop one last stray record by index
In [40]:
clean_df.shape
Out[40]:
In [41]:
clean_df.sort_values(by=['date', 'time'], inplace=True)
clean_df.reset_index(inplace=True)
del clean_df['index']
clean_df.head()
Out[41]:
In [42]:
tweets_by_day = clean_df.groupby("date").count()['tweet']
clean_df_tweet_by_day_plot = bar_graph(tweets_by_day.index, tweets_by_day, "tweets by day", "Date", "No. Tweets", "tweets-by-day-V2")
clean_df_tweet_by_day_plot # plot looks pretty uniform!
Out[42]:
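bar_graph is one of the wrappers loaded from plotly_functions.py. A minimal sketch of what such a wrapper might look like, using the plotly imports at the top of the notebook (hypothetical; the real wrapper may differ):
def bar_graph_sketch(x, y, title, x_title, y_title, filename):
    # hypothetical wrapper: build a bar chart and publish it via plot.ly
    layout = go.Layout(title=title,
                       xaxis=dict(title=x_title),
                       yaxis=dict(title=y_title))
    fig = go.Figure(data=[go.Bar(x=x, y=y)], layout=layout)
    return py.iplot(fig, filename=filename)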
In [43]:
Image("tweets_by_day.png")
Out[43]:
In [44]:
diff_df = pd.DataFrame(clean_df.groupby("date").count()['tweet'].diff())
diff_df_plot = line_graph(list(diff_df.index)[2:], list(diff_df['tweet'])[2:],'Difference in tweets by day', 'Date', 'Difference', 'difference-bar-plot')
diff_df_plot # a couple of spikes!
Out[44]:
In [45]:
Image("difference_by_day.png")
Out[45]:
In [46]:
#Let's find how many unique hashtags we have over a 4 month period
set_of_all_hashtags = set()
counts = {}
for i in range(clean_df.shape[0]):
    temp_lst = clean_df.iloc[i, 6].split()
    hashtags = [x for x in temp_lst if x.startswith("#")]
    for tag in hashtags:
        if tag not in counts:
            counts[tag] = 1
        else:
            counts[tag] += 1
        set_of_all_hashtags.add(tag)
print("There are {} unique hashtags over the course of 4 months".format(len(set_of_all_hashtags)))
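As an aside, collections.Counter (imported at the top) could produce the same counts more compactly. A sketch, assuming clean_tweet_V2 is the column being scanned:
counts = Counter(tag
                 for tweet in clean_df['clean_tweet_V2']
                 for tweet_word in [tweet]
                 for tag in tweet_word.split()
                 if tag.startswith("#"))
set_of_all_hashtags = set(counts)  # the unique hashtags are just the keys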
In [47]:
# let's visualize the top 50 hashtags
counts_lst = []
for k, v in counts.items():
    counts_lst.append((k, v))
top_50_hashtags = pd.DataFrame(sorted(counts_lst, key=lambda x: x[1], reverse=True)[:50])
top_50_hashtags.set_index(0, inplace=True)
top_50_hashtags.sort_values(by=1, inplace=True)
top_50_hashtags.index.names = ['hashtag']
top_50_hashtags = top_50_hashtags.rename(columns = {1:'count'})
top_50_hashtags_plot = horizontal_bar_graph(top_50_hashtags['count'], top_50_hashtags.index, 'Top 50 hashtags by count', 'Count', 'Hashtag','top-50-hashtags-barh')
top_50_hashtags_plot
Out[47]:
In [48]:
Image("top_50_hashtags.png")
Out[48]:
Let's look at some statistics at the day level:
In [49]:
stats = pd.DataFrame(clean_df.groupby("date")['tweet'].size().describe())
stats.drop(["count"], inplace=True)
stats = stats.rename(columns = {"tweet":'tweets_per_day'})
print(tabulate(stats, headers='keys', tablefmt='fancy_grid'))
Now that we have plotted the hashtags, there is no need to keep the hash symbols in the corpus of a particular tweet. In fact, keeping them would only confuse our sentiment calculations.
In [50]:
clean_df['clean_tweet_V2'] = clean_df['clean_tweet_V2'].apply(lambda x: x.replace("#",""))
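The same replacement can also be written with the vectorized .str accessor, avoiding the Python-level apply:
clean_df['clean_tweet_V2'] = clean_df['clean_tweet_V2'].str.replace("#", "")  # equivalent, vectorized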
In [51]:
clean_df.head()
Out[51]:
While it is great that we have such a high level of granularity in our time column, we do not need it for this analysis. Instead, we will place tweets into hourly buckets, which lets us analyse the data at both the day level and the hour level!
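As a side note, the loop in the next cell could also be written in a vectorized form. A sketch, assuming the time column holds datetime.time objects:
hours = clean_df['time'].apply(lambda t: t.hour).tolist()  # hour of each tweet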
In [52]:
hours = []
for index in clean_df.index:
    hours.append(clean_df.iloc[index, 2].hour)
In [53]:
clean_df.shape[0] == len(hours) #perfect
Out[53]:
In [54]:
clean_df['hour_of_day'] = hours
# relabel midnight (hour 0) as hour 24 so the hours run from 1 to 24
clean_df = clean_df.set_value(clean_df[clean_df['hour_of_day'] == 0].index, "hour_of_day", 24)
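As an aside, set_value was deprecated in later pandas releases; the equivalent assignment with .loc would be:
# same relabelling of hour 0 to 24, written with .loc
clean_df.loc[clean_df['hour_of_day'] == 0, 'hour_of_day'] = 24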
In [55]:
clean_df.head()
Out[55]:
In [56]:
tweets_by_hour = clean_df.groupby("hour_of_day").count()['tweet']
clean_df_tweet_by_hour_plot = line_graph(tweets_by_hour.index, tweets_by_hour, "tweets by hour", "Hour", "No. Tweets", "tweets-by-hour")
clean_df_tweet_by_hour_plot # plot looks pretty uniform!
Out[56]:
In [57]:
Image("tweets_by_hour.png")
Out[57]:
In [58]:
number_of_tweets = []
number_of_users = []
for k, v in Counter(clean_df.groupby("handle").count()['tweet']).items():
    number_of_tweets.append(k)
    number_of_users.append(v)
scaled_num_users = list(map(lambda x: x/10e3, number_of_users))
tweets_per_user_plot = bar_graph(number_of_tweets, scaled_num_users, "Tweets per user", "Number of tweets per user", "Number of Users (10e3)", "tweet-per-user")
tweets_per_user_plot
Out[58]:
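The same distribution can be computed with two chained value_counts calls, which reads a little more directly:
# tweets per handle, then how many handles share each tweet count
tweets_per_user = clean_df['handle'].value_counts().value_counts()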
In [59]:
Image("tweets_by_user.png") #the first bar has a value of 5.0471 x 10^3
Out[59]:
This concludes the exploration notebook! In the next notebook, we will gather some additional data and prepare our data for the modelling process!
As far as model building is concerned, we only need the date, hour_of_day, and clean_tweet_V2 columns. Everything else is irrelevant, so let's go ahead and drop it!
In [85]:
clean_df.drop(["handle", "time", "tweet", "tuple_version_tweet", "clean_tweet_V1"], axis=1, inplace=True)
In [86]:
clean_df.head()
Out[86]:
In [87]:
pickle_object(clean_df, "clean_df_NB3_Complete")