notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
import os
import sys
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from unicode_codes import EMOJI_UNICODE
sns.set(style="white")
from twitter_search_funcs import smoothed_relative_freq



In [3]:

    
df_all = pd.DataFrame.from_csv("./data/alldata2016.csv")
df_lang = pd.DataFrame.from_csv("./data/langdata2016.csv")
df_allemoji = pd.DataFrame.from_csv("./data/allemojidata2016.csv")



In [4]:

    
# Totals
total_tweets = 101252136
total_tweets_wEmoji = 17094118
total_matches = 38600
total_wBefore = 23374
total_wAfter = 12317



In [5]:

    
# Find percentages of before matches, percentages of after matches, and count per million of total tweets searched
df_all['PercentBefore'] = 100. * df_all.CountBefore / df_all.CountBefore.sum()
df_all['PercentAfter'] = 100. * df_all.CountAfter / df_all.CountAfter.sum()
df_all['PPMBefore'] = 1e6 * df_all.CountBefore / total_tweets
df_all['PPMAfter'] = 1e6 * df_all.CountAfter / total_tweets

df_allemoji['Percent'] = 100. * df_allemoji.Count / df_allemoji.Count.sum()
df_allemoji['PPM'] = 1e6 * df_allemoji.Count / total_tweets



In [6]:

    
# Merge total counts into df_all
df_all_merged = df_all.merge(df_allemoji, on='Emoji', how='left')



In [7]:

    
# Filter and sort dataframes
filter_to = 200
df_topbefore = df_all_merged.sort_values('CountBefore', ascending=False)[:filter_to]
df_topafter = df_all_merged.sort_values('CountAfter', ascending=False)[1:filter_to + 1]
df_top = df_all_merged.sort_values('Count', ascending=False)[:filter_to]



In [8]:

    
# Testing of filter
df_topafter.tail()









    Out[8]:







  
    
      
      Emoji
      CountBefore
      CountAfter
      PercentBefore
      PercentAfter
      PPMBefore
      PPMAfter
      Count
      Percent
      PPM
    
  
  
    
      485
      🌷
      1.0
      5.0
      0.004278
      0.040594
      0.009876
      0.049382
      61299
      0.137409
      605.409450
    
    
      413
      🔑
      1.0
      5.0
      0.004278
      0.040594
      0.009876
      0.049382
      14623
      0.032779
      144.421645
    
    
      172
      ❄
      1.0
      5.0
      0.004278
      0.040594
      0.009876
      0.049382
      38894
      0.087185
      384.130168
    
    
      330
      ⚒
      5.0
      4.0
      0.021391
      0.032475
      0.049382
      0.039505
      2402
      0.005384
      23.722956
    
    
      456
      ⚠
      1.0
      4.0
      0.004278
      0.032475
      0.009876
      0.039505
      38118
      0.085446
      376.466132



In [9]:

    
print(1e6 * df_all.CountBefore.sum() / total_tweets)
print(1e6 * df_all.CountAfter.sum() / total_tweets)









    



230.84945091923788
121.6468164187667



In [10]:

    
df_topbefore.iloc[:10].sort_values('CountBefore', ascending=True).plot.barh(x='Emoji',
                                                                            y='PPMBefore',
                                                                            figsize=(10,8),
                                                                            legend=False,
                                                                            xlim=(0,32))
plt.title('Emoji Appearing Before the Gun Emoji', fontsize=30)
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Count / Million', fontsize=25)
plt.savefig('images/before-bar-10.png')
plt.show()



In [13]:

    
df_topafter.iloc[:10].sort_values('CountAfter', ascending=True).plot.barh(x='Emoji',
                                                                          y='PPMAfter',
                                                                          figsize=(10,8),
                                                                          legend=False,
                                                                          xlim=(0,7.5))
plt.title('Emoji Appearing After the Gun Emoji', fontsize=30)
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Count / Million', fontsize=25)
plt.savefig('images/after-bar-10.png')
plt.show()



In [14]:

    
df_top.iloc[[0,1,2,3,4,5,7,8,9,11]].sort_values('Count', ascending=True).plot.barh(x='Emoji',
                                                                y='PPM',
                                                                figsize=(10,8),
                                                                legend=False,
                                                                xlim=(0,50000))
plt.title('Total Emoji Counts', fontsize=30)
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Count / Million', fontsize=25)
plt.savefig('images/total-10.png')
plt.show()



In [15]:

    
pd.options.display.float_format = '{:.2f}'.format
df_topafter.head()









    Out[15]:







  
    
      
      Emoji
      CountBefore
      CountAfter
      PercentBefore
      PercentAfter
      PPMBefore
      PPMAfter
      Count
      Percent
      PPM
    
  
  
    
      16
      🔪
      614.00
      735.00
      2.63
      5.97
      6.06
      7.26
      28839
      0.06
      284.82
    
    
      4
      💣
      450.00
      601.00
      1.93
      4.88
      4.44
      5.94
      27518
      0.06
      271.78
    
    
      0
      😂
      3135.00
      569.00
      13.41
      4.62
      30.96
      5.62
      4810304
      10.78
      47508.17
    
    
      29
      😎
      153.00
      242.00
      0.65
      1.96
      1.51
      2.39
      205482
      0.46
      2029.41
    
    
      7
      💥
      1197.00
      224.00
      5.12
      1.82
      11.82
      2.21
      167060
      0.37
      1649.94



In [16]:

    
pd.options.display.float_format = '{:.2f}'.format
df_topbefore.head()









    Out[16]:







  
    
      
      Emoji
      CountBefore
      CountAfter
      PercentBefore
      PercentAfter
      PPMBefore
      PPMAfter
      Count
      Percent
      PPM
    
  
  
    
      0
      😂
      3135.00
      569.00
      13.41
      4.62
      30.96
      5.62
      4810304
      10.78
      47508.17
    
    
      5
      😭
      2096.00
      195.00
      8.97
      1.58
      20.70
      1.93
      1671746
      3.75
      16510.72
    
    
      7
      💥
      1197.00
      224.00
      5.12
      1.82
      11.82
      2.21
      167060
      0.37
      1649.94
    
    
      23
      😊
      767.00
      48.00
      3.28
      0.39
      7.58
      0.47
      626739
      1.40
      6189.88
    
    
      3
      🙃
      680.00
      25.00
      2.91
      0.20
      6.72
      0.25
      132273
      0.30
      1306.37



In [17]:

    
pd.options.display.float_format = '{:.2f}'.format
df_top.head()









    Out[17]:







  
    
      
      Emoji
      CountBefore
      CountAfter
      PercentBefore
      PercentAfter
      PPMBefore
      PPMAfter
      Count
      Percent
      PPM
    
  
  
    
      0
      😂
      3135.00
      569.00
      13.41
      4.62
      30.96
      5.62
      4810304
      10.78
      47508.17
    
    
      53
      ❤
      117.00
      97.00
      0.50
      0.79
      1.16
      0.96
      1766052
      3.96
      17442.12
    
    
      5
      😭
      2096.00
      195.00
      8.97
      1.58
      20.70
      1.93
      1671746
      3.75
      16510.72
    
    
      57
      😍
      281.00
      81.00
      1.20
      0.66
      2.78
      0.80
      1564388
      3.51
      15450.42
    
    
      56
      💕
      217.00
      106.00
      0.93
      0.86
      2.14
      1.05
      902590
      2.02
      8914.28



In [18]:

    
# Add smoothed relative frequencies to df_all_merged
df_topafter['Score1'] = smoothed_relative_freq(df_topafter.CountAfter, df_topafter.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=1)
df_topafter['Score100'] = smoothed_relative_freq(df_topafter.CountAfter, df_topafter.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=100)
df_topafter['Score10k'] = smoothed_relative_freq(df_topafter.CountAfter, df_topafter.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=10000)
df_topafter['Score1m'] = smoothed_relative_freq(df_topafter.CountAfter, df_topafter.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=1000000)

df_topbefore['Score1'] = smoothed_relative_freq(df_topbefore.CountBefore, df_topbefore.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=1)
df_topbefore['Score100'] = smoothed_relative_freq(df_topbefore.CountBefore, df_topbefore.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=100)
df_topbefore['Score10k'] = smoothed_relative_freq(df_topbefore.CountBefore, df_topbefore.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=10000)
df_topbefore['Score1m'] = smoothed_relative_freq(df_topbefore.CountBefore, df_topbefore.Count,
                                               df_topafter.CountAfter.sum(), df_topafter.Count.sum(),
                                               N=1000000)



In [19]:

    
df_topafter.sort_values('Score100', ascending=False).head(15)









    Out[19]:







  
    
      
      Emoji
      CountBefore
      CountAfter
      PercentBefore
      PercentAfter
      PPMBefore
      PPMAfter
      Count
      Percent
      PPM
      Score1
      Score100
      Score10k
      Score1m
    
  
  
    
      16
      🔪
      614.00
      735.00
      2.63
      5.97
      6.06
      7.26
      28839
      0.06
      284.82
      122.14
      110.03
      10.91
      1.11
    
    
      4
      💣
      450.00
      601.00
      1.93
      4.88
      4.44
      5.94
      27518
      0.06
      271.78
      104.66
      93.85
      9.12
      1.09
    
    
      161
      🗡
      31.00
      51.00
      0.13
      0.41
      0.31
      0.50
      2067
      0.00
      20.41
      116.57
      46.71
      1.74
      1.01
    
    
      286
      ⚰
      4.00
      66.00
      0.02
      0.54
      0.04
      0.65
      4840
      0.01
      47.80
      65.00
      39.59
      1.95
      1.01
    
    
      199
      💂
      11.00
      54.00
      0.05
      0.44
      0.11
      0.53
      4767
      0.01
      47.08
      53.99
      32.76
      1.77
      1.01
    
    
      82
      ⛓
      7.00
      26.00
      0.03
      0.21
      0.07
      0.26
      1402
      0.00
      13.85
      86.99
      27.57
      1.38
      1.00
    
    
      212
      🔨
      33.00
      36.00
      0.14
      0.29
      0.33
      0.36
      3450
      0.01
      34.07
      49.61
      26.30
      1.52
      1.01
    
    
      64
      ☠
      66.00
      58.00
      0.28
      0.47
      0.65
      0.57
      7585
      0.02
      74.91
      36.53
      26.00
      1.82
      1.01
    
    
      149
      👮
      53.00
      46.00
      0.23
      0.37
      0.52
      0.45
      6413
      0.01
      63.34
      34.25
      23.20
      1.65
      1.01
    
    
      40
      ⛽
      1.00
      28.00
      0.00
      0.23
      0.01
      0.28
      5525
      0.01
      54.57
      24.18
      15.70
      1.39
      1.00
    
    
      128
      ⛏
      10.00
      13.00
      0.04
      0.11
      0.10
      0.13
      1089
      0.00
      10.76
      55.65
      15.15
      1.19
      1.00
    
    
      179
      🚬
      57.00
      37.00
      0.24
      0.30
      0.56
      0.37
      9180
      0.02
      90.66
      19.27
      14.55
      1.51
      1.01
    
    
      131
      ⚔
      16.00
      29.00
      0.07
      0.24
      0.16
      0.29
      6714
      0.02
      66.31
      20.63
      14.30
      1.40
      1.00
    
    
      147
      💉
      24.00
      41.00
      0.10
      0.33
      0.24
      0.40
      12357
      0.03
      122.04
      15.88
      12.82
      1.55
      1.01
    
    
      203
      🃏
      55.00
      10.00
      0.24
      0.08
      0.54
      0.10
      2583
      0.01
      25.51
      18.36
      8.80
      1.14
      1.00



In [20]:

    
df_topbefore.sort_values('Score100', ascending=False).head(15)









    Out[20]:







  
    
      
      Emoji
      CountBefore
      CountAfter
      PercentBefore
      PercentAfter
      PPMBefore
      PPMAfter
      Count
      Percent
      PPM
      Score1
      Score100
      Score10k
      Score1m
    
  
  
    
      16
      🔪
      614.00
      735.00
      2.63
      5.97
      6.06
      7.26
      28839
      0.06
      284.82
      102.03
      91.93
      9.27
      1.09
    
    
      4
      💣
      450.00
      601.00
      1.93
      4.88
      4.44
      5.94
      27518
      0.06
      271.78
      78.37
      70.30
      7.06
      1.07
    
    
      203
      🃏
      55.00
      10.00
      0.24
      0.08
      0.54
      0.10
      2583
      0.01
      25.51
      100.91
      45.87
      1.80
      1.01
    
    
      7
      💥
      1197.00
      224.00
      5.12
      1.82
      11.82
      2.21
      167060
      0.37
      1649.94
      34.37
      33.74
      12.35
      1.17
    
    
      76
      🐔
      73.00
      6.00
      0.31
      0.05
      0.72
      0.06
      7259
      0.02
      71.69
      48.04
      33.67
      2.04
      1.01
    
    
      64
      ☠
      66.00
      58.00
      0.28
      0.47
      0.65
      0.57
      7585
      0.02
      74.91
      41.57
      29.55
      1.93
      1.01
    
    
      161
      🗡
      31.00
      51.00
      0.13
      0.41
      0.31
      0.50
      2067
      0.00
      20.41
      70.86
      28.63
      1.45
      1.00
    
    
      149
      👮
      53.00
      46.00
      0.23
      0.37
      0.52
      0.45
      6413
      0.01
      63.34
      39.46
      26.68
      1.75
      1.01
    
    
      212
      🔨
      33.00
      36.00
      0.14
      0.29
      0.33
      0.36
      3450
      0.01
      34.07
      45.47
      24.15
      1.47
      1.00
    
    
      3
      🙃
      680.00
      25.00
      2.91
      0.20
      6.72
      0.25
      132273
      0.30
      1306.37
      24.66
      24.10
      7.86
      1.10
    
    
      252
      🔌
      43.00
      11.00
      0.18
      0.09
      0.42
      0.11
      5766
      0.01
      56.95
      35.58
      23.27
      1.61
      1.01
    
    
      179
      🚬
      57.00
      37.00
      0.24
      0.30
      0.56
      0.37
      9180
      0.02
      90.66
      29.69
      22.28
      1.79
      1.01
    
    
      6
      🙂
      348.00
      31.00
      1.49
      0.25
      3.44
      0.31
      77291
      0.17
      763.35
      21.59
      20.77
      4.97
      1.05
    
    
      93
      😲
      123.00
      11.00
      0.53
      0.09
      1.21
      0.11
      28293
      0.06
      279.43
      20.83
      18.82
      2.59
      1.02
    
    
      26
      😐
      250.00
      12.00
      1.07
      0.10
      2.47
      0.12
      65096
      0.15
      642.91
      18.42
      17.60
      3.92
      1.03



In [28]:

    
df_topbefore.sort_values('Score100', ascending=False).iloc[:10]\
            .sort_values('Score100', ascending=True).plot.barh(x='Emoji',
                                                               y='Score100',
                                                               figsize=(10,8),
                                                               legend=False,
                                                               xlim=(0,95))
plt.title('Emoji Appearing Before the Gun Emoji', fontsize=30)
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Normalized Relative Frequency (Smoothing = 100)', fontsize=25)
plt.savefig('images/before-bar-10-NRF100.png')
plt.show()



In [29]:

    
df_topafter.sort_values('Score100', ascending=False).iloc[:10]\
           .sort_values('Score100', ascending=True).plot.barh(x='Emoji',
                                                               y='Score100',
                                                               figsize=(10,8),
                                                               legend=False,
                                                               xlim=(0,115))
plt.title('Emoji Appearing After the Gun Emoji', fontsize=30)
plt.yticks(fontsize=35, fontname='EmojiOne Color', rotation='horizontal')
plt.xticks(fontsize=20)
plt.ylabel('')
plt.xlabel('Normalized Relative Frequency (Smoothing = 100)', fontsize=25)
plt.savefig('images/after-bar-10-NRF100.png')
plt.show()



In [ ]:

	Emoji	CountBefore	CountAfter	PercentBefore	PercentAfter	PPMBefore	PPMAfter	Count	Percent	PPM
485	🌷	1.0	5.0	0.004278	0.040594	0.009876	0.049382	61299	0.137409	605.409450
413	🔑	1.0	5.0	0.004278	0.040594	0.009876	0.049382	14623	0.032779	144.421645
172	❄	1.0	5.0	0.004278	0.040594	0.009876	0.049382	38894	0.087185	384.130168
330	⚒	5.0	4.0	0.021391	0.032475	0.049382	0.039505	2402	0.005384	23.722956
456	⚠	1.0	4.0	0.004278	0.032475	0.009876	0.039505	38118	0.085446	376.466132

	Emoji	CountBefore	CountAfter	PercentBefore	PercentAfter	PPMBefore	PPMAfter	Count	Percent	PPM
16	🔪	614.00	735.00	2.63	5.97	6.06	7.26	28839	0.06	284.82
4	💣	450.00	601.00	1.93	4.88	4.44	5.94	27518	0.06	271.78
0	😂	3135.00	569.00	13.41	4.62	30.96	5.62	4810304	10.78	47508.17
29	😎	153.00	242.00	0.65	1.96	1.51	2.39	205482	0.46	2029.41
7	💥	1197.00	224.00	5.12	1.82	11.82	2.21	167060	0.37	1649.94