In [1]:
# IPython magic: selects matplotlib's inline backend for this notebook session.
%matplotlib inline
In [2]:
import os
import sys
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from unicode_codes import EMOJI_UNICODE
sns.set(style="white")
from twitter_search_funcs import smoothed_relative_freq
In [3]:
# Load the three input tables.
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 and parse_dates=True is its documented equivalent
# (first CSV column becomes the index, and date parsing is attempted on it).
df_all = pd.read_csv("./data/alldata2016.csv", index_col=0, parse_dates=True)
df_lang = pd.read_csv("./data/langdata2016.csv", index_col=0, parse_dates=True)
df_allemoji = pd.read_csv("./data/allemojidata2016.csv", index_col=0, parse_dates=True)
In [4]:
# Corpus-level totals, kept as literals rather than recomputed from the data.
total_tweets = 101252136  # tweets searched; denominator for the per-million rates
total_tweets_wEmoji = 17094118  # presumably tweets containing any emoji -- TODO confirm
total_matches = 38600  # presumably tweets matching the search -- TODO confirm
total_wBefore = 23374  # presumably matches with an emoji before the target -- TODO confirm
total_wAfter = 12317  # presumably matches with an emoji after the target -- TODO confirm
In [5]:
# Derive two normalisations of every raw count column:
#   Percent* -- share of that column's own total, in percent
#   PPM*     -- occurrences per million of the total tweets searched
before_counts = df_all['CountBefore']
after_counts = df_all['CountAfter']
overall_counts = df_allemoji['Count']

# Columns are added in this order on purpose so the displayed frames keep
# the layout Percent*, then PPM*.
df_all['PercentBefore'] = 100. * before_counts / before_counts.sum()
df_all['PercentAfter'] = 100. * after_counts / after_counts.sum()
df_all['PPMBefore'] = 1e6 * before_counts / total_tweets
df_all['PPMAfter'] = 1e6 * after_counts / total_tweets

df_allemoji['Percent'] = 100. * overall_counts / overall_counts.sum()
df_allemoji['PPM'] = 1e6 * overall_counts / total_tweets
In [6]:
# Bring the overall per-emoji columns of df_allemoji alongside the
# before/after columns of df_all, matching rows on 'Emoji'.
# how='left' keeps every row of df_all even when df_allemoji has no match.
df_all_merged = pd.merge(df_all, df_allemoji, how='left', on='Emoji')
In [7]:
# Rank the merged table three ways and keep `filter_to` rows from each ranking.
# Each result is an explicit .copy(): later cells add Score* columns to these
# frames, and assigning to a plain slice raises SettingWithCopyWarning.
filter_to = 200
df_topbefore = df_all_merged.sort_values('CountBefore', ascending=False).iloc[:filter_to].copy()
# NOTE(review): the "after" ranking skips its first row (positions 1..filter_to).
# Presumably the top entry is unwanted here -- confirm and record the reason.
df_topafter = df_all_merged.sort_values('CountAfter', ascending=False).iloc[1:filter_to + 1].copy()
df_top = df_all_merged.sort_values('Count', ascending=False).iloc[:filter_to].copy()
In [8]:
# Sanity check of the filter: show the last five rows kept in df_topafter.
df_topafter.tail(5)
Out[8]:
In [9]:
# Overall per-million rate (relative to all tweets searched) of the summed
# "before" counts, then of the summed "after" counts.
for count_column in ('CountBefore', 'CountAfter'):
    print(1e6 * df_all[count_column].sum() / total_tweets)
In [10]:
# Bar chart: per-million rate of the ten highest-CountBefore emoji.
# The rows are re-sorted ascending because barh draws the first row at the
# bottom, which leaves the largest bar on top.
top10_before = df_topbefore.iloc[:10].sort_values('CountBefore', ascending=True)
ax = top10_before.plot.barh(
    x='Emoji',
    y='PPMBefore',
    figsize=(10, 8),
    legend=False,
    xlim=(0, 32),
)
ax.set_title('Emoji Appearing Before the Gun Emoji', fontsize=30)
# The y tick labels are the emoji themselves, hence the emoji font.
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
ax.set_ylabel('')
ax.set_xlabel('Count / Million', fontsize=25)
plt.savefig('images/before-bar-10.png')
plt.show()
In [13]:
# Bar chart: per-million rate of the first ten rows of df_topafter.
# The rows are re-sorted ascending because barh draws the first row at the
# bottom, which leaves the largest bar on top.
top10_after = df_topafter.iloc[:10].sort_values('CountAfter', ascending=True)
ax = top10_after.plot.barh(
    x='Emoji',
    y='PPMAfter',
    figsize=(10, 8),
    legend=False,
    xlim=(0, 7.5),
)
ax.set_title('Emoji Appearing After the Gun Emoji', fontsize=30)
# The y tick labels are the emoji themselves, hence the emoji font.
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
ax.set_ylabel('')
ax.set_xlabel('Count / Million', fontsize=25)
plt.savefig('images/after-bar-10.png')
plt.show()
In [14]:
# Bar chart: overall per-million rate for ten hand-picked rows of df_top.
# NOTE(review): rank positions 6 and 10 are skipped; the reason is not
# visible in this notebook -- confirm and document it.
picked_positions = [0, 1, 2, 3, 4, 5, 7, 8, 9, 11]
top10_overall = df_top.iloc[picked_positions].sort_values('Count', ascending=True)
ax = top10_overall.plot.barh(
    x='Emoji',
    y='PPM',
    figsize=(10, 8),
    legend=False,
    xlim=(0, 50000),
)
ax.set_title('Total Emoji Counts', fontsize=30)
# The y tick labels are the emoji themselves, hence the emoji font.
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
ax.set_ylabel('')
ax.set_xlabel('Count / Million', fontsize=25)
plt.savefig('images/total-10.png')
plt.show()
In [15]:
# Show floats with two decimals in DataFrame output (global pandas option),
# then preview the first five rows of df_topafter.
pd.set_option('display.float_format', '{:.2f}'.format)
df_topafter.head(5)
Out[15]:
In [16]:
# Show floats with two decimals in DataFrame output (global pandas option),
# then preview the first five rows of df_topbefore.
pd.set_option('display.float_format', '{:.2f}'.format)
df_topbefore.head(5)
Out[16]:
In [17]:
# Show floats with two decimals in DataFrame output (global pandas option),
# then preview the first five rows of df_top.
pd.set_option('display.float_format', '{:.2f}'.format)
df_top.head(5)
Out[17]:
In [18]:
# Add smoothed relative-frequency scores to df_topafter and df_topbefore,
# one column per smoothing value N.
#
# smoothed_relative_freq comes from twitter_search_funcs; its definition is
# not visible here. The call pattern below assumes the arguments are
# (context counts, overall counts, context total, overall total, N=smoothing)
# -- TODO confirm against the helper.
#
# Fix: the "before" scores used to be normalised with the *after* frame's
# totals (df_topafter.CountAfter.sum() / df_topafter.Count.sum()), a
# copy-paste slip. Each frame is now normalised with its own totals.
smoothing_levels = [
    ('Score1', 1),
    ('Score100', 100),
    ('Score10k', 10000),
    ('Score1m', 1000000),
]

# Work on independent copies so adding columns never writes into a slice of
# df_all_merged (avoids SettingWithCopyWarning) and re-running is harmless.
df_topafter = df_topafter.copy()
df_topbefore = df_topbefore.copy()

# The totals do not depend on N, so compute them once per frame.
after_context_total = df_topafter.CountAfter.sum()
after_overall_total = df_topafter.Count.sum()
before_context_total = df_topbefore.CountBefore.sum()
before_overall_total = df_topbefore.Count.sum()

for score_column, smoothing in smoothing_levels:
    df_topafter[score_column] = smoothed_relative_freq(
        df_topafter.CountAfter, df_topafter.Count,
        after_context_total, after_overall_total,
        N=smoothing)
    df_topbefore[score_column] = smoothed_relative_freq(
        df_topbefore.CountBefore, df_topbefore.Count,
        before_context_total, before_overall_total,
        N=smoothing)
In [19]:
# Fifteen highest Score100 rows among the "after" emoji.
df_topafter.sort_values(by='Score100', ascending=False).head(n=15)
Out[19]:
In [20]:
# Fifteen highest Score100 rows among the "before" emoji.
df_topbefore.sort_values(by='Score100', ascending=False).head(n=15)
Out[20]:
In [28]:
# Bar chart: the ten "before" emoji with the highest Score100.
# Take the top ten by descending score, then flip to ascending because barh
# draws the first row at the bottom, which leaves the largest bar on top.
top10_before_nrf = (
    df_topbefore
    .sort_values('Score100', ascending=False)
    .iloc[:10]
    .sort_values('Score100', ascending=True)
)
ax = top10_before_nrf.plot.barh(
    x='Emoji',
    y='Score100',
    figsize=(10, 8),
    legend=False,
    xlim=(0, 95),
)
ax.set_title('Emoji Appearing Before the Gun Emoji', fontsize=30)
# The y tick labels are the emoji themselves, hence the emoji font.
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
ax.set_ylabel('')
ax.set_xlabel('Normalized Relative Frequency (Smoothing = 100)', fontsize=25)
plt.savefig('images/before-bar-10-NRF100.png')
plt.show()
In [29]:
# Bar chart: the ten "after" emoji with the highest Score100.
# Take the top ten by descending score, then flip to ascending because barh
# draws the first row at the bottom, which leaves the largest bar on top.
top10_after_nrf = (
    df_topafter
    .sort_values('Score100', ascending=False)
    .iloc[:10]
    .sort_values('Score100', ascending=True)
)
ax = top10_after_nrf.plot.barh(
    x='Emoji',
    y='Score100',
    figsize=(10, 8),
    legend=False,
    xlim=(0, 115),
)
ax.set_title('Emoji Appearing After the Gun Emoji', fontsize=30)
# The y tick labels are the emoji themselves, hence the emoji font.
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
ax.set_ylabel('')
ax.set_xlabel('Normalized Relative Frequency (Smoothing = 100)', fontsize=25)
plt.savefig('images/after-bar-10-NRF100.png')
plt.show()
In [ ]: