In [1]:

    
# Render matplotlib figures inline in the notebook output.
%matplotlib inline



In [2]:

    
import os
import sys
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from unicode_codes import EMOJI_UNICODE
# Apply seaborn's default theme with the "white" axes style to all later plots.
sns.set(style="white")
from twitter_search_funcs import smoothed_relative_freq



In [3]:

    
# Print the Python interpreter version string used to run this notebook.
print(sys.version)









    



3.6.4 (default, Mar  7 2018, 12:11:36) 
[GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.39.2)]

Data loading and cleaning



In [4]:

    
# Load data for all years.
#
# For every year this reads three CSV files (alldata_*, langdata_* and
# allemojidata_*) plus the five totals stored on lines 2-6 of the run log,
# then stacks the per-year frames and attaches the totals to each row.
list_a = []
list_l = []
list_e = []
totals = {}

# The log file is read positionally: one total per line, in exactly this order.
log_fields = ['total_tweets', 'total_tweets_wEmoji', 'total_matches',
              'total_wBefore', 'total_wAfter']

for year in [2014, 2015, 2016]:
    df_a = pd.read_csv("./data/alldata_{}_repeat.csv".format(year), index_col=0)
    df_l = pd.read_csv("./data/langdata_{}_repeat.csv".format(year), index_col=0)
    df_e = pd.read_csv("./data/allemojidata_{}_repeat.csv".format(year), index_col=0)
    # Tag every row with its year so the frames can be concatenated and joined on it.
    for frame in (df_a, df_l, df_e):
        frame['Year'] = year
    list_a.append(df_a)
    list_l.append(df_l)
    list_e.append(df_e)

    with open("./data/log_{}_repeat.txt".format(year)) as f:
        f.readline()  # the first line of the log does not hold a total; skip it
        # Each total is the last whitespace-separated token on its line.
        # Parsing to int here keeps `totals` numeric from the start.
        totals[year] = {field: int(f.readline().split()[-1]) for field in log_fields}

# One row per year, one column per total.
df_totals = pd.DataFrame.from_dict(totals, orient='index', dtype=int)
df_all = pd.concat(list_a).join(df_totals, how='left', on='Year')
df_lang = pd.concat(list_l)
df_allemoji = pd.concat(list_e).join(df_totals, how='left', on='Year')

# Sum of the Count column per year, attached to every row as 'total_emoji'.
# rename() is called without inplace=True: the in-place form is documented to
# return None, which must not be assigned to emojiCount.
emojiCount = df_allemoji.groupby('Year')['Count'].sum().rename('total_emoji')
df_allemoji = df_allemoji.join(emojiCount, how='left', on='Year')



In [5]:

    
# Derived rates for the before/after match counts:
#   Percent<side> = share of all before (resp. after) matches, in percent
#   PPM<side>     = count per million of total tweets searched
# Two passes keep the resulting column order: both Percent columns, then both PPM columns.
for side in ('Before', 'After'):
    df_all['Percent' + side] = 100. * df_all['Count' + side] / df_all['total_w' + side]

for side in ('Before', 'After'):
    df_all['PPM' + side] = 1e6 * df_all['Count' + side] / df_all['total_tweets']

# Same two rates for the overall emoji counts, relative to the yearly emoji total.
emoji_counts = df_allemoji['Count']
df_allemoji['Percent'] = 100. * emoji_counts / df_allemoji['total_emoji']
df_allemoji['PPM'] = 1e6 * emoji_counts / df_allemoji['total_tweets']



In [6]:

    
# Combine the before/after frame with the overall emoji frame.
# The per-year totals already exist in df_all, so they are dropped from the
# right-hand side to avoid duplicated (suffixed) columns.
drop_cols = [
    'total_tweets',
    'total_tweets_wEmoji',
    'total_matches',
    'total_wBefore',
    'total_wAfter',
]
emoji_side = df_allemoji.drop(drop_cols, axis=1)

# Outer join: an (Emoji, Year) pair present in only one frame is kept, with NaN
# in the columns that come from the other frame.
df_all_merged = df_all.merge(emoji_side, how='outer', on=['Emoji', 'Year'])



In [7]:

    
# Add smoothed relative frequencies to df_all_merged, one pair of columns per N.
# Column names encode log10(N): Score0_*, Score2_*, Score4_*, Score6_*.
# NOTE(review): the exact formula lives in twitter_search_funcs.smoothed_relative_freq;
# presumably N is the smoothing strength - confirm against that module.
for N in [1, 100, 10000, 1000000]:
    logN = np.log10(N)
    for tag, side in (('Bef', 'Before'), ('Aft', 'After')):
        score_col = 'Score{:.0f}_{}'.format(logN, tag)
        df_all_merged[score_col] = smoothed_relative_freq(
            df_all_merged['Count' + side],
            df_all_merged['Count'],
            df_all_merged['total_w' + side],
            df_all_merged['total_emoji'],
            N=N)



In [8]:

    
# Error columns for the before/after PPM values: the PPM value scaled by the
# relative uncertainty sqrt(n) / n of the underlying count
# (presumably a Poisson counting error - confirm).
for side in ('Before', 'After'):
    side_counts = df_all_merged['Count' + side]
    df_all_merged['PPM{}_err'.format(side)] = (
        df_all_merged['PPM' + side] * np.sqrt(side_counts) / side_counts
    )



In [9]:

    
# Error on the N = 100 scores: the relative uncertainties sqrt(n) / n of the
# side-specific count and of the overall count are added in quadrature and
# multiplied by the score itself.
rel_err_total = np.sqrt(df_all_merged['Count']) / df_all_merged['Count']

for tag, side in (('Bef', 'Before'), ('Aft', 'After')):
    side_counts = df_all_merged['Count' + side]
    rel_err_side = np.sqrt(side_counts) / side_counts
    df_all_merged['Score2_{}_err'.format(tag)] = (
        df_all_merged['Score2_' + tag] * np.sqrt(rel_err_side**2 + rel_err_total**2)
    )



In [10]:

    
# Error on the overall PPM: PPM scaled by sqrt(Count) / Count.
overall_counts = df_all_merged['Count']
df_all_merged['PPM_err'] = df_all_merged['PPM'] * np.sqrt(overall_counts) / overall_counts



In [11]:

    
# Show floats with two decimals, then preview the first five merged rows.
pd.set_option('display.float_format', '{:.2f}'.format)
df_all_merged.head(5)









    Out[11]:







  
    
      
      Emoji
      CountBefore
      CountAfter
      Year
      total_tweets
      total_tweets_wEmoji
      total_matches
      total_wBefore
      total_wAfter
      PercentBefore
      ...
      Score2_Aft
      Score4_Bef
      Score4_Aft
      Score6_Bef
      Score6_Aft
      PPMBefore_err
      PPMAfter_err
      Score2_Bef_err
      Score2_Aft_err
      PPM_err
    
  
  
    
      0
      😊
      1246.00
      88.00
      2014
      141266004.00
      17774137.00
      82973.00
      48893.00
      27566.00
      2.55
      ...
      0.28
      1.63
      0.61
      1.01
      0.99
      0.25
      0.07
      0.06
      0.03
      5.35
    
    
      1
      🌚
      397.00
      165.00
      2014
      141266004.00
      17774137.00
      82973.00
      48893.00
      27566.00
      0.81
      ...
      2.20
      1.43
      1.26
      1.01
      1.00
      0.14
      0.09
      0.15
      0.17
      2.56
    
    
      2
      💰
      253.00
      241.00
      2014
      141266004.00
      17774137.00
      82973.00
      48893.00
      27566.00
      0.52
      ...
      3.67
      1.23
      1.52
      1.00
      1.01
      0.11
      0.11
      0.14
      0.24
      2.38
    
    
      3
      💥
      2258.00
      311.00
      2014
      141266004.00
      17774137.00
      82973.00
      48893.00
      27566.00
      4.62
      ...
      2.57
      3.92
      1.49
      1.04
      1.01
      0.34
      0.12
      0.22
      0.15
      3.26
    
    
      4
      🙌
      112.00
      66.00
      2014
      141266004.00
      17774137.00
      82973.00
      48893.00
      27566.00
      0.23
      ...
      0.25
      0.62
      0.63
      0.99
      0.99
      0.07
      0.06
      0.02
      0.03
      4.90
    
  

5 rows × 30 columns



In [12]:

    
# Filter and sort dataframes by counts
filter_to = 200
df_topbefore = df_all_merged.groupby('Year').apply(lambda x: x.nlargest(filter_to, columns='CountBefore'))
df_topafter = df_all_merged.groupby('Year').apply(lambda x: x.nlargest(filter_to, columns='CountAfter'))
df_top = df_all_merged.groupby('Year').apply(lambda x: x.nlargest(filter_to, columns='Count'))

Tables



In [13]:

    
# Year selected for the single-year tables and plots in the cells below.
year = 2016



In [14]:

    
# Counts are shown without decimals.
pd.set_option('display.float_format', '{:.0f}'.format)
# First eight rows for `year` (largest CountAfter first), columns up to and including 'Year'.
df_topafter.loc[year, :'Year'].iloc[:8]









    Out[14]:







  
    
      
      Emoji
      CountBefore
      CountAfter
      Year
    
  
  
    
      1748
      🔫
      nan
      5052
      2016
    
    
      1241
      🔪
      614
      735
      2016
    
    
      1235
      💣
      450
      601
      2016
    
    
      1225
      😂
      3135
      569
      2016
    
    
      1258
      😎
      153
      242
      2016
    
    
      1230
      💥
      1197
      224
      2016
    
    
      1272
      🔥
      222
      207
      2016
    
    
      1236
      😭
      2096
      195
      2016



In [15]:

    
# Counts are shown without decimals.
pd.set_option('display.float_format', '{:.0f}'.format)
# First eight rows for `year` (largest CountBefore first), columns up to and including 'Year'.
df_topbefore.loc[year, :'Year'].iloc[:8]









    Out[15]:







  
    
      
      Emoji
      CountBefore
      CountAfter
      Year
    
  
  
    
      1225
      😂
      3135
      569
      2016
    
    
      1236
      😭
      2096
      195
      2016
    
    
      1230
      💥
      1197
      224
      2016
    
    
      1248
      😊
      767
      48
      2016
    
    
      1228
      🙃
      680
      25
      2016
    
    
      1241
      🔪
      614
      735
      2016
    
    
      1252
      😒
      516
      89
      2016
    
    
      1271
      😩
      503
      32
      2016



In [16]:

    
# Counts are shown without decimals.
pd.set_option('display.float_format', '{:.0f}'.format)
# First eight rows for `year` (largest overall Count first), columns up to and including 'Year'.
df_top.loc[year, :'Year'].iloc[:8]









    Out[16]:







  
    
      
      Emoji
      CountBefore
      CountAfter
      Year
    
  
  
    
      1225
      😂
      3135
      569
      2016
    
    
      1278
      ❤
      117
      97
      2016
    
    
      1236
      😭
      2096
      195
      2016
    
    
      1282
      😍
      281
      81
      2016
    
    
      1281
      💕
      217
      106
      2016
    
    
      1446
      ✨
      16
      38
      2016
    
    
      1316
      🏻
      168
      nan
      2016
    
    
      1272
      🔥
      222
      207
      2016



In [17]:

    
# Scores are fractional, so switch back to two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)
cols = ['Emoji', 'CountBefore', 'CountAfter', 'Year', 'Score2_Bef']
# Eight highest N = 100 "before" scores among this year's top-CountBefore rows.
(df_topbefore.loc[year][cols]
 .sort_values(by='Score2_Bef', ascending=False)
 .iloc[:8])









    Out[17]:







  
    
      
      Emoji
      CountBefore
      CountAfter
      Year
      Score2_Bef
    
  
  
    
      1241
      🔪
      614.00
      735.00
      2016
      35.32
    
    
      1235
      💣
      450.00
      601.00
      2016
      27.00
    
    
      1428
      🃏
      55.00
      10.00
      2016
      15.54
    
    
      1230
      💥
      1197.00
      224.00
      2016
      13.35
    
    
      1301
      🐔
      73.00
      6.00
      2016
      12.27
    
    
      1289
      ☠
      66.00
      58.00
      2016
      10.83
    
    
      1385
      🗡
      31.00
      51.00
      2016
      9.75
    
    
      1373
      👮
      53.00
      46.00
      2016
      9.71



In [18]:

    
# Scores are fractional, so keep two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)
cols = ['Emoji', 'CountAfter', 'CountBefore', 'Year', 'Score2_Aft']
# Eight highest N = 100 "after" scores among this year's top-CountAfter rows.
(df_topafter.loc[year][cols]
 .sort_values(by='Score2_Aft', ascending=False)
 .iloc[:8])









    Out[18]:







  
    
      
      Emoji
      CountAfter
      CountBefore
      Year
      Score2_Aft
    
  
  
    
      1748
      🔫
      5052.00
      nan
      2016
      305.33
    
    
      1241
      🔪
      735.00
      614.00
      2016
      80.08
    
    
      1235
      💣
      601.00
      450.00
      2016
      68.21
    
    
      1385
      🗡
      51.00
      31.00
      2016
      28.98
    
    
      1511
      ⚰
      66.00
      4.00
      2016
      26.18
    
    
      1424
      💂
      54.00
      11.00
      2016
      21.68
    
    
      1289
      ☠
      58.00
      66.00
      2016
      17.81
    
    
      1438
      🔨
      36.00
      33.00
      2016
      17.05

Plots



In [19]:

    
# Bar chart of the ten largest CountBefore rows for `year`, as counts per million.
# df_topbefore is already ordered largest-first within a year, so the first ten
# rows are the top ten; they are then re-sorted ascending for plotting.
before_top10 = (
    df_topbefore.loc[year]
    .iloc[:10]
    .sort_values('CountBefore', ascending=True)
)
before_top10.plot.barh(
    x='Emoji', y='PPMBefore', xerr='PPMBefore_err',
    figsize=(10,8), legend=False, xlim=(0,32), color='grey',
)

# Emoji tick labels need an emoji-capable font.
plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Count per million', fontsize=25)
plt.savefig('images/before-bar-10-{}-print.pdf'.format(year), bbox_inches='tight')
plt.show()



In [20]:

    
# Bar chart of ranks 2-11 by CountAfter for `year`, as counts per million.
# The top-ranked row is skipped on purpose (iloc[1:11]).
# NOTE(review): in Out[14] that row is the gun emoji for 2016 - presumably the
# search target itself; confirm this holds whenever `year` is changed.
after_rank2_to_11 = (
    df_topafter.loc[year]
    .iloc[1:11]
    .sort_values('CountAfter', ascending=True)
)
after_rank2_to_11.plot.barh(
    x='Emoji', y='PPMAfter', xerr='PPMAfter_err',
    figsize=(10,8), legend=False, xlim=(0,8), color='grey',
)

# Emoji tick labels need an emoji-capable font.
plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Count per million', fontsize=25)
plt.savefig('images/after-bar-10-{}-print.pdf'.format(year), bbox_inches='tight')
plt.show()



In [21]:

    
# Bar chart of the ten highest N = 100 "before" scores for `year`.
# Rank descending to pick the ten best, then re-sort ascending for plotting.
before_best_scores = (
    df_topbefore.loc[year]
    .sort_values('Score2_Bef', ascending=False)
    .iloc[:10]
    .sort_values('Score2_Bef', ascending=True)
)
before_best_scores.plot.barh(
    x='Emoji', y='Score2_Bef', xerr='Score2_Bef_err',
    figsize=(10,8), legend=False, xlim=(0,38), color='grey',
)

# Emoji tick labels need an emoji-capable font.
plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Normalized Relative Frequency (N = 100)', fontsize=25)
plt.savefig('images/before-bar-10-NRF100-{}-print.pdf'.format(year), bbox_inches='tight')
plt.show()



In [22]:

    
# Bar chart of ranks 2-11 by N = 100 "after" score for `year`.
# The highest-scoring row is skipped on purpose (iloc[1:11]).
# NOTE(review): in Out[18] that row is the gun emoji for 2016 - presumably the
# search target itself; confirm this holds whenever `year` is changed.
after_best_scores = (
    df_topafter.loc[year]
    .sort_values('Score2_Aft', ascending=False)
    .iloc[1:11]
    .sort_values('Score2_Aft', ascending=True)
)
after_best_scores.plot.barh(
    x='Emoji', y='Score2_Aft', xerr='Score2_Aft_err',
    figsize=(10,8), legend=False, xlim=(0,85), color='grey',
)

# Emoji tick labels need an emoji-capable font.
plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Normalized Relative Frequency (N = 100)', fontsize=25)
plt.savefig('images/after-bar-10-NRF100-{}-print.pdf'.format(year), bbox_inches='tight')
plt.show()



In [23]:

    
# Bar chart of the six emoji with the highest overall PPM for `year`.
# Rank descending to pick the six best, then re-sort ascending for plotting.
overall_top6 = (
    df_top.loc[year]
    .sort_values('PPM', ascending=False)
    .iloc[:6]
    .sort_values('PPM', ascending=True)
)
overall_top6.plot.barh(
    x='Emoji', y='PPM', xerr='PPM_err',
    figsize=(10,5), legend=False, xlim=(0,50000), color='grey',
)

# Emoji tick labels need an emoji-capable font.
plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Count per million', fontsize=25)
plt.savefig('images/total-6-{}-print.pdf'.format(year), bbox_inches='tight')
plt.show()



In [24]:

    
# Union of each year's eight highest-PPM emoji.
# The loop variable is deliberately not called `year`, so the year selected for
# the single-year cells is left untouched.
topset = set()
for yr in (2014, 2015, 2016):
    yearly_best = df_top.loc[yr].sort_values('PPM', ascending=False).iloc[:8]
    topset.update(yearly_best.Emoji.tolist())



In [26]:

    
# Grouped bar chart comparing overall PPM across the three years.
# Rows: emoji from `topset`; columns: one PPM value per year.
in_topset = df_top.Emoji.isin(list(topset))
plotdf = df_top[in_topset].pivot(index='Emoji', columns='Year', values='PPM')
# Relabel the year columns for the legend; this relies on the pivot producing
# exactly three year columns in ascending order.
plotdf.columns = ['August 2014', 'August 2015', 'August 2016']

# Drop emoji missing a value in any year and keep the eight largest 2016 values.
(plotdf.sort_values('August 2016', ascending=True)
       .dropna()
       .iloc[-8:]
       .plot.barh(figsize=(10,6.5), xlim=(0,50000)))

# Emoji tick labels need an emoji-capable font.
plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
plt.xticks(fontsize=20)
plt.legend(fontsize=20)
plt.ylabel('')
plt.xlabel('Count per million', fontsize=25)
plt.savefig('images/total-8-allyears-print.pdf', bbox_inches='tight')
plt.show()

Multiplots



In [27]:

    
# One panel per year: the ten largest CountBefore rows, as counts per million.
fig, ax = plt.subplots(1, 3, figsize=(10,8))
plt.tight_layout(pad=2.5)

for i, y in enumerate([2014, 2015, 2016]):
    # df_topbefore is ordered largest-first within a year, so the first ten
    # rows are the top ten; re-sort ascending for plotting.
    df_topbefore.loc[y].iloc[:10] \
                .sort_values('CountBefore', ascending=True) \
                .plot.barh(x='Emoji',
                           y='PPMBefore',
                           xerr='PPMBefore_err',
                           legend=False,
                           color='grey',
                           ax=ax[i])
    # Make this panel current so the pyplot calls below apply to it.
    plt.sca(ax[i])
    plt.title(y, fontsize=25)
    plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
    plt.xticks(fontsize=20)
    plt.ylabel('')

# A single x-axis label, placed under the middle panel.
plt.sca(ax[1])
plt.xlabel('Count per million', fontsize=25)
# The file name covers all years and contains no placeholder, so it is passed
# as a plain string and the cell does not depend on the global `year`.
plt.savefig('images/before-bar-10-allyears-print.pdf', bbox_inches='tight')
plt.show()



In [28]:

    
# One panel per year: ranks 2-11 by CountAfter, as counts per million.
fig, ax = plt.subplots(1, 3, figsize=(10,8))
plt.tight_layout(pad=2.5)

for i, y in enumerate([2014, 2015, 2016]):
    # iloc[1:11] skips each year's top-ranked row on purpose.
    # NOTE(review): for 2016 that row is the gun emoji (see Out[14]); presumably
    # the same holds for 2014 and 2015 - confirm.
    df_topafter.loc[y].iloc[1:11] \
               .sort_values('CountAfter', ascending=True) \
               .plot.barh(x='Emoji',
                           y='PPMAfter',
                           xerr='PPMAfter_err',
                           legend=False,
                           color='grey',
                           ax=ax[i])
    # Make this panel current so the pyplot calls below apply to it.
    plt.sca(ax[i])
    plt.title(y, fontsize=25)
    plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
    plt.xticks(fontsize=20)
    plt.ylabel('')

# A single x-axis label, placed under the middle panel.
plt.sca(ax[1])
plt.xlabel('Count per million', fontsize=25)
# The file name covers all years and contains no placeholder, so it is passed
# as a plain string and the cell does not depend on the global `year`.
plt.savefig('images/after-bar-10-allyears-print.pdf', bbox_inches='tight')
plt.show()



In [29]:

    
# One panel per year: the ten highest N = 100 "before" scores.
fig, ax = plt.subplots(1, 3, figsize=(10,8))
plt.tight_layout(pad=2.5)

for i, y in enumerate([2014, 2015, 2016]):
    # Rank descending to pick the ten best, then re-sort ascending for plotting.
    df_topbefore.loc[y] \
                .sort_values('Score2_Bef', ascending=False).iloc[:10] \
                .sort_values('Score2_Bef', ascending=True) \
                .plot.barh(x='Emoji',
                           y='Score2_Bef',
                           xerr='Score2_Bef_err',
                           legend=False,
                           color='grey',
                           ax=ax[i])
    # Make this panel current so the pyplot calls below apply to it.
    plt.sca(ax[i])
    plt.title(y, fontsize=25)
    plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
    plt.xticks(fontsize=20)
    plt.ylabel('')

# A single x-axis label, placed under the middle panel.
plt.sca(ax[1])
plt.xlabel('Normalized Relative Frequency (N = 100)', fontsize=25)
# The file name covers all years and contains no placeholder, so it is passed
# as a plain string and the cell does not depend on the global `year`.
plt.savefig('images/before-bar-10-NRF100-allyears-print.pdf', bbox_inches='tight')
plt.show()



In [30]:

    
# One panel per year: ranks 2-11 by N = 100 "after" score.
fig, ax = plt.subplots(1, 3, figsize=(10,8))
plt.tight_layout(pad=2.5)

for i, y in enumerate([2014, 2015, 2016]):
    # iloc[1:11] skips each year's highest-scoring row on purpose.
    # NOTE(review): for 2016 that row is the gun emoji (see Out[18]); presumably
    # the same holds for 2014 and 2015 - confirm.
    df_topafter.loc[y] \
               .sort_values('Score2_Aft', ascending=False).iloc[1:11] \
               .sort_values('Score2_Aft', ascending=True) \
               .plot.barh(x='Emoji',
                           y='Score2_Aft',
                           xerr='Score2_Aft_err',
                           legend=False,
                           color='grey',
                           ax=ax[i])
    # Make this panel current so the pyplot calls below apply to it.
    plt.sca(ax[i])
    plt.title(y, fontsize=25)
    plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
    plt.xticks(fontsize=20)
    plt.ylabel('')

# A single x-axis label, placed under the middle panel.
plt.sca(ax[1])
plt.xlabel('Normalized Relative Frequency (N = 100)', fontsize=25)
# The file name covers all years and contains no placeholder, so it is passed
# as a plain string and the cell does not depend on the global `year`.
plt.savefig('images/after-bar-10-NRF100-allyears-print.pdf', bbox_inches='tight')
plt.show()



In [ ]:

	Emoji	CountBefore	CountAfter	Year	total_tweets	total_tweets_wEmoji	total_matches	total_wBefore	total_wAfter	PercentBefore	...	Score2_Aft	Score4_Bef	Score4_Aft	Score6_Bef	Score6_Aft	PPMBefore_err	PPMAfter_err	Score2_Bef_err	Score2_Aft_err	PPM_err
0	😊	1246.00	88.00	2014	141266004.00	17774137.00	82973.00	48893.00	27566.00	2.55	...	0.28	1.63	0.61	1.01	0.99	0.25	0.07	0.06	0.03	5.35
1	🌚	397.00	165.00	2014	141266004.00	17774137.00	82973.00	48893.00	27566.00	0.81	...	2.20	1.43	1.26	1.01	1.00	0.14	0.09	0.15	0.17	2.56
2	💰	253.00	241.00	2014	141266004.00	17774137.00	82973.00	48893.00	27566.00	0.52	...	3.67	1.23	1.52	1.00	1.01	0.11	0.11	0.14	0.24	2.38
3	💥	2258.00	311.00	2014	141266004.00	17774137.00	82973.00	48893.00	27566.00	4.62	...	2.57	3.92	1.49	1.04	1.01	0.34	0.12	0.22	0.15	3.26
4	🙌	112.00	66.00	2014	141266004.00	17774137.00	82973.00	48893.00	27566.00	0.23	...	0.25	0.62	0.63	0.99	0.99	0.07	0.06	0.02	0.03	4.90

	Emoji	CountBefore	CountAfter	Year
1748	🔫	nan	5052	2016
1241	🔪	614	735	2016
1235	💣	450	601	2016
1225	😂	3135	569	2016
1258	😎	153	242	2016
1230	💥	1197	224	2016
1272	🔥	222	207	2016
1236	😭	2096	195	2016

	Emoji	CountBefore	CountAfter	Year	Score2_Bef
1241	🔪	614.00	735.00	2016	35.32
1235	💣	450.00	601.00	2016	27.00
1428	🃏	55.00	10.00	2016	15.54
1230	💥	1197.00	224.00	2016	13.35
1301	🐔	73.00	6.00	2016	12.27
1289	☠	66.00	58.00	2016	10.83
1385	🗡	31.00	51.00	2016	9.75
1373	👮	53.00	46.00	2016	9.71

	Emoji	CountAfter	CountBefore	Year	Score2_Aft
1748	🔫	5052.00	nan	2016	305.33
1241	🔪	735.00	614.00	2016	80.08
1235	💣	601.00	450.00	2016	68.21
1385	🗡	51.00	31.00	2016	28.98
1511	⚰	66.00	4.00	2016	26.18
1424	💂	54.00	11.00	2016	21.68
1289	☠	58.00	66.00	2016	17.81
1438	🔨	36.00	33.00	2016	17.05