In [1]:
%matplotlib inline

In [2]:
import os
import sys
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from unicode_codes import EMOJI_UNICODE
sns.set(style="white")
from twitter_search_funcs import smoothed_relative_freq

In [3]:
df_all = pd.DataFrame.from_csv("./data/alldata2016.csv")
df_lang = pd.DataFrame.from_csv("./data/langdata2016.csv")
df_allemoji = pd.DataFrame.from_csv("./data/allemojidata2016.csv")

In [4]:
# Totals
total_tweets = 101252136
total_tweets_wEmoji = 17094118
total_matches = 38600
total_wBefore = 23374
total_wAfter = 12317

In [5]:
# Find percentages of before matches, percentages of after matches, and count per million of total tweets searched
df_all['PercentBefore'] = 100. * df_all.CountBefore / df_all.CountBefore.sum()
df_all['PercentAfter'] = 100. * df_all.CountAfter / df_all.CountAfter.sum()
df_all['PPMBefore'] = 1e6 * df_all.CountBefore / total_tweets
df_all['PPMAfter'] = 1e6 * df_all.CountAfter / total_tweets

df_allemoji['Percent'] = 100. * df_allemoji.Count / df_allemoji.Count.sum()
df_allemoji['PPM'] = 1e6 * df_allemoji.Count / total_tweets

In [6]:
# Merge total counts into df_all
df_all_merged = df_all.merge(df_allemoji, on='Emoji', how='left')

In [7]:
# Filter and sort dataframes
filter_to = 200
df_topbefore = df_all_merged.sort_values('CountBefore', ascending=False)[:filter_to]
df_topafter = df_all_merged.sort_values('CountAfter', ascending=False)[1:filter_to + 1]
df_top = df_all_merged.sort_values('Count', ascending=False)[:filter_to]

In [8]:
# Testing of filter
df_topafter.tail()


Out[8]:
Emoji CountBefore CountAfter PercentBefore PercentAfter PPMBefore PPMAfter Count Percent PPM
485 🌷 1.0 5.0 0.004278 0.040594 0.009876 0.049382 61299 0.137409 605.409450
413 🔑 1.0 5.0 0.004278 0.040594 0.009876 0.049382 14623 0.032779 144.421645
172 1.0 5.0 0.004278 0.040594 0.009876 0.049382 38894 0.087185 384.130168
330 5.0 4.0 0.021391 0.032475 0.049382 0.039505 2402 0.005384 23.722956
456 1.0 4.0 0.004278 0.032475 0.009876 0.039505 38118 0.085446 376.466132

In [9]:
print(1e6 * df_all.CountBefore.sum() / total_tweets)
print(1e6 * df_all.CountAfter.sum() / total_tweets)


230.84945091923788
121.6468164187667

In [10]:
df_topbefore.iloc[:10].sort_values('CountBefore', ascending=True).plot.barh(x='Emoji',
                                                                            y='PPMBefore',
                                                                            figsize=(10,8),
                                                                            legend=False,
                                                                            xlim=(0,32))
plt.title('Emoji Appearing Before the Gun Emoji', fontsize=30)
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Count / Million', fontsize=25)
plt.savefig('images/before-bar-10.png')
plt.show()



In [13]:
df_topafter.iloc[:10].sort_values('CountAfter', ascending=True).plot.barh(x='Emoji',
                                                                          y='PPMAfter',
                                                                          figsize=(10,8),
                                                                          legend=False,
                                                                          xlim=(0,7.5))
plt.title('Emoji Appearing After the Gun Emoji', fontsize=30)
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Count / Million', fontsize=25)
plt.savefig('images/after-bar-10.png')
plt.show()



In [14]:
df_top.iloc[[0,1,2,3,4,5,7,8,9,11]].sort_values('Count', ascending=True).plot.barh(x='Emoji',
                                                                y='PPM',
                                                                figsize=(10,8),
                                                                legend=False,
                                                                xlim=(0,50000))
plt.title('Total Emoji Counts', fontsize=30)
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Count / Million', fontsize=25)
plt.savefig('images/total-10.png')
plt.show()



In [15]:
pd.options.display.float_format = '{:.2f}'.format
df_topafter.head()


Out[15]:
Emoji CountBefore CountAfter PercentBefore PercentAfter PPMBefore PPMAfter Count Percent PPM
16 🔪 614.00 735.00 2.63 5.97 6.06 7.26 28839 0.06 284.82
4 💣 450.00 601.00 1.93 4.88 4.44 5.94 27518 0.06 271.78
0 😂 3135.00 569.00 13.41 4.62 30.96 5.62 4810304 10.78 47508.17
29 😎 153.00 242.00 0.65 1.96 1.51 2.39 205482 0.46 2029.41
7 💥 1197.00 224.00 5.12 1.82 11.82 2.21 167060 0.37 1649.94

In [16]:
pd.options.display.float_format = '{:.2f}'.format
df_topbefore.head()


Out[16]:
Emoji CountBefore CountAfter PercentBefore PercentAfter PPMBefore PPMAfter Count Percent PPM
0 😂 3135.00 569.00 13.41 4.62 30.96 5.62 4810304 10.78 47508.17
5 😭 2096.00 195.00 8.97 1.58 20.70 1.93 1671746 3.75 16510.72
7 💥 1197.00 224.00 5.12 1.82 11.82 2.21 167060 0.37 1649.94
23 😊 767.00 48.00 3.28 0.39 7.58 0.47 626739 1.40 6189.88
3 🙃 680.00 25.00 2.91 0.20 6.72 0.25 132273 0.30 1306.37

In [17]:
pd.options.display.float_format = '{:.2f}'.format
df_top.head()


Out[17]:
Emoji CountBefore CountAfter PercentBefore PercentAfter PPMBefore PPMAfter Count Percent PPM
0 😂 3135.00 569.00 13.41 4.62 30.96 5.62 4810304 10.78 47508.17
53 117.00 97.00 0.50 0.79 1.16 0.96 1766052 3.96 17442.12
5 😭 2096.00 195.00 8.97 1.58 20.70 1.93 1671746 3.75 16510.72
57 😍 281.00 81.00 1.20 0.66 2.78 0.80 1564388 3.51 15450.42
56 💕 217.00 106.00 0.93 0.86 2.14 1.05 902590 2.02 8914.28

In [18]:
# Add smoothed relative frequencies to df_all_merged
df_topafter['Score1'] = smoothed_relative_freq(df_topafter.CountAfter, df_topafter.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=1)
df_topafter['Score100'] = smoothed_relative_freq(df_topafter.CountAfter, df_topafter.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=100)
df_topafter['Score10k'] = smoothed_relative_freq(df_topafter.CountAfter, df_topafter.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=10000)
df_topafter['Score1m'] = smoothed_relative_freq(df_topafter.CountAfter, df_topafter.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=1000000)

df_topbefore['Score1'] = smoothed_relative_freq(df_topbefore.CountBefore, df_topbefore.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=1)
df_topbefore['Score100'] = smoothed_relative_freq(df_topbefore.CountBefore, df_topbefore.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=100)
df_topbefore['Score10k'] = smoothed_relative_freq(df_topbefore.CountBefore, df_topbefore.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=10000)
df_topbefore['Score1m'] = smoothed_relative_freq(df_topbefore.CountBefore, df_topbefore.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=1000000)

In [19]:
df_topafter.sort_values('Score100', ascending=False).head(15)


Out[19]:
Emoji CountBefore CountAfter PercentBefore PercentAfter PPMBefore PPMAfter Count Percent PPM Score1 Score100 Score10k Score1m
16 🔪 614.00 735.00 2.63 5.97 6.06 7.26 28839 0.06 284.82 122.14 110.03 10.91 1.11
4 💣 450.00 601.00 1.93 4.88 4.44 5.94 27518 0.06 271.78 104.66 93.85 9.12 1.09
161 🗡 31.00 51.00 0.13 0.41 0.31 0.50 2067 0.00 20.41 116.57 46.71 1.74 1.01
286 4.00 66.00 0.02 0.54 0.04 0.65 4840 0.01 47.80 65.00 39.59 1.95 1.01
199 💂 11.00 54.00 0.05 0.44 0.11 0.53 4767 0.01 47.08 53.99 32.76 1.77 1.01
82 7.00 26.00 0.03 0.21 0.07 0.26 1402 0.00 13.85 86.99 27.57 1.38 1.00
212 🔨 33.00 36.00 0.14 0.29 0.33 0.36 3450 0.01 34.07 49.61 26.30 1.52 1.01
64 66.00 58.00 0.28 0.47 0.65 0.57 7585 0.02 74.91 36.53 26.00 1.82 1.01
149 👮 53.00 46.00 0.23 0.37 0.52 0.45 6413 0.01 63.34 34.25 23.20 1.65 1.01
40 1.00 28.00 0.00 0.23 0.01 0.28 5525 0.01 54.57 24.18 15.70 1.39 1.00
128 10.00 13.00 0.04 0.11 0.10 0.13 1089 0.00 10.76 55.65 15.15 1.19 1.00
179 🚬 57.00 37.00 0.24 0.30 0.56 0.37 9180 0.02 90.66 19.27 14.55 1.51 1.01
131 16.00 29.00 0.07 0.24 0.16 0.29 6714 0.02 66.31 20.63 14.30 1.40 1.00
147 💉 24.00 41.00 0.10 0.33 0.24 0.40 12357 0.03 122.04 15.88 12.82 1.55 1.01
203 🃏 55.00 10.00 0.24 0.08 0.54 0.10 2583 0.01 25.51 18.36 8.80 1.14 1.00

In [20]:
df_topbefore.sort_values('Score100', ascending=False).head(15)


Out[20]:
Emoji CountBefore CountAfter PercentBefore PercentAfter PPMBefore PPMAfter Count Percent PPM Score1 Score100 Score10k Score1m
16 🔪 614.00 735.00 2.63 5.97 6.06 7.26 28839 0.06 284.82 102.03 91.93 9.27 1.09
4 💣 450.00 601.00 1.93 4.88 4.44 5.94 27518 0.06 271.78 78.37 70.30 7.06 1.07
203 🃏 55.00 10.00 0.24 0.08 0.54 0.10 2583 0.01 25.51 100.91 45.87 1.80 1.01
7 💥 1197.00 224.00 5.12 1.82 11.82 2.21 167060 0.37 1649.94 34.37 33.74 12.35 1.17
76 🐔 73.00 6.00 0.31 0.05 0.72 0.06 7259 0.02 71.69 48.04 33.67 2.04 1.01
64 66.00 58.00 0.28 0.47 0.65 0.57 7585 0.02 74.91 41.57 29.55 1.93 1.01
161 🗡 31.00 51.00 0.13 0.41 0.31 0.50 2067 0.00 20.41 70.86 28.63 1.45 1.00
149 👮 53.00 46.00 0.23 0.37 0.52 0.45 6413 0.01 63.34 39.46 26.68 1.75 1.01
212 🔨 33.00 36.00 0.14 0.29 0.33 0.36 3450 0.01 34.07 45.47 24.15 1.47 1.00
3 🙃 680.00 25.00 2.91 0.20 6.72 0.25 132273 0.30 1306.37 24.66 24.10 7.86 1.10
252 🔌 43.00 11.00 0.18 0.09 0.42 0.11 5766 0.01 56.95 35.58 23.27 1.61 1.01
179 🚬 57.00 37.00 0.24 0.30 0.56 0.37 9180 0.02 90.66 29.69 22.28 1.79 1.01
6 🙂 348.00 31.00 1.49 0.25 3.44 0.31 77291 0.17 763.35 21.59 20.77 4.97 1.05
93 😲 123.00 11.00 0.53 0.09 1.21 0.11 28293 0.06 279.43 20.83 18.82 2.59 1.02
26 😐 250.00 12.00 1.07 0.10 2.47 0.12 65096 0.15 642.91 18.42 17.60 3.92 1.03

In [28]:
df_topbefore.sort_values('Score100', ascending=False).iloc[:10]\
            .sort_values('Score100', ascending=True).plot.barh(x='Emoji',
                                                               y='Score100',
                                                               figsize=(10,8),
                                                               legend=False,
                                                               xlim=(0,95))
plt.title('Emoji Appearing Before the Gun Emoji', fontsize=30)
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Normalized Relative Frequency (Smoothing = 100)', fontsize=25)
plt.savefig('images/before-bar-10-NRF100.png')
plt.show()



In [29]:
df_topafter.sort_values('Score100', ascending=False).iloc[:10]\
           .sort_values('Score100', ascending=True).plot.barh(x='Emoji',
                                                               y='Score100',
                                                               figsize=(10,8),
                                                               legend=False,
                                                               xlim=(0,115))
plt.title('Emoji Appearing After the Gun Emoji', fontsize=30)
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Normalized Relative Frequency (Smoothing = 100)', fontsize=25)
plt.savefig('images/after-bar-10-NRF100.png')
plt.show()



In [ ]: