In [1]:
%matplotlib inline
In [2]:
import os
import sys
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from unicode_codes import EMOJI_UNICODE
sns.set(style="white")
from twitter_search_funcs import smoothed_relative_freq
In [3]:
# Print the Python interpreter version string for this kernel.
print(sys.version)
In [4]:
# Load data for all years.
#
# For each year this reads the alldata_*, langdata_* and allemojidata_* CSV
# files, tags every row with its year, and reads five totals from that year's
# log file. The totals are then joined onto the concatenated frames by 'Year'.
list_a = []
list_l = []
list_e = []
totals = {}
# One total is read per log line, in this order, after the first line.
log_fields = ['total_tweets', 'total_tweets_wEmoji', 'total_matches',
              'total_wBefore', 'total_wAfter']
for year in [2014, 2015, 2016]:
    df_a = pd.read_csv("./data/alldata_{}_repeat.csv".format(year), index_col=0)
    df_l = pd.read_csv("./data/langdata_{}_repeat.csv".format(year), index_col=0)
    df_e = pd.read_csv("./data/allemojidata_{}_repeat.csv".format(year), index_col=0)
    df_a['Year'], df_l['Year'], df_e['Year'] = year, year, year
    list_a.append(df_a)
    list_l.append(df_l)
    list_e.append(df_e)
    with open("./data/log_{}_repeat.txt".format(year)) as f:
        f.readline()  # the first line is read and discarded
        # The last whitespace-separated token of each line is the value.
        # Converting here makes a malformed log fail at the read, instead of
        # relying on string-to-int coercion when the frame is built.
        totals[year] = {field: int(f.readline().strip().split()[-1])
                        for field in log_fields}
df_totals = pd.DataFrame.from_dict(totals, orient='index', dtype=int)
df_all = pd.concat(list_a).join(df_totals, how='left', on='Year')
df_lang = pd.concat(list_l)
df_allemoji = pd.concat(list_e).join(df_totals, how='left', on='Year')
# Per-year sum of Count, named so the join below adds a 'total_emoji' column.
# The returned Series is used, so rename is called without inplace.
emojiCount = df_allemoji.groupby('Year').Count.sum().rename('total_emoji')
df_allemoji = df_allemoji.join(emojiCount, how='left', on='Year')
In [5]:
# Find percentages of before matches, percentages of after matches, and
# count per million of total tweets searched.
# Each entry: (frame, new column, scale factor, count column, denominator column).
rate_specs = [
    (df_all, 'PercentBefore', 100., 'CountBefore', 'total_wBefore'),
    (df_all, 'PercentAfter', 100., 'CountAfter', 'total_wAfter'),
    (df_all, 'PPMBefore', 1e6, 'CountBefore', 'total_tweets'),
    (df_all, 'PPMAfter', 1e6, 'CountAfter', 'total_tweets'),
    (df_allemoji, 'Percent', 100., 'Count', 'total_emoji'),
    (df_allemoji, 'PPM', 1e6, 'Count', 'total_tweets'),
]
for frame, new_col, scale, count_col, denom_col in rate_specs:
    # Scale first, then divide, so the arithmetic order matches every column.
    frame[new_col] = scale * frame[count_col] / frame[denom_col]
In [6]:
# Merge total counts into df_all.
# The per-year totals already live on df_all, so they are dropped from the
# emoji frame before the outer merge on (Emoji, Year).
drop_cols = ['total_tweets', 'total_tweets_wEmoji', 'total_matches', 'total_wBefore', 'total_wAfter']
emoji_without_totals = df_allemoji.drop(drop_cols, axis=1)
df_all_merged = pd.merge(df_all, emoji_without_totals,
                         how='outer',
                         on=['Emoji', 'Year'])
In [7]:
# Add smoothed relative frequencies to df_all_merged.
# For every N, one "before" and one "after" column is added; the column name
# carries log10(N) formatted with no decimals.
# Each entry: (column-name template, count column, matching total column).
score_inputs = [
    ('Score{:.0f}_Bef', 'CountBefore', 'total_wBefore'),
    ('Score{:.0f}_Aft', 'CountAfter', 'total_wAfter'),
]
for N in [1, 100, 10000, 1000000]:
    logN = np.log10(N)
    for name_template, count_col, total_col in score_inputs:
        df_all_merged[name_template.format(logN)] = smoothed_relative_freq(
            df_all_merged[count_col],
            df_all_merged['Count'],
            df_all_merged[total_col],
            df_all_merged['total_emoji'],
            N=N)
In [8]:
# Add errors: each error column is value * sqrt(count) / count.
# Each entry: (error column, value column, count column).
ppm_error_specs = [
    ('PPMBefore_err', 'PPMBefore', 'CountBefore'),
    ('PPMAfter_err', 'PPMAfter', 'CountAfter'),
]
for err_col, value_col, count_col in ppm_error_specs:
    counts = df_all_merged[count_col]
    df_all_merged[err_col] = df_all_merged[value_col] * np.sqrt(counts) / counts
In [9]:
# Errors on the Score2_* columns: the score times the quadrature sum of the
# sqrt(count)/count terms of the directional count and of the total Count.
term_total = np.sqrt(df_all_merged['Count']) / df_all_merged['Count']
term_before = np.sqrt(df_all_merged['CountBefore']) / df_all_merged['CountBefore']
term_after = np.sqrt(df_all_merged['CountAfter']) / df_all_merged['CountAfter']

df_all_merged['Score2_Bef_err'] = (
    df_all_merged['Score2_Bef'] * np.sqrt(term_before**2 + term_total**2)
)
df_all_merged['Score2_Aft_err'] = (
    df_all_merged['Score2_Aft'] * np.sqrt(term_after**2 + term_total**2)
)
In [10]:
# Error on the overall PPM column: PPM * sqrt(Count) / Count.
total_counts = df_all_merged['Count']
df_all_merged['PPM_err'] = df_all_merged['PPM'] * np.sqrt(total_counts) / total_counts
In [11]:
# Show floats with two decimals, then preview the merged frame.
pd.set_option('display.float_format', '{:.2f}'.format)
df_all_merged.head()
Out[11]:
In [12]:
# Filter and sort dataframes by counts
filter_to = 200


def _largest_per_year(count_col):
    """Return, per Year group of df_all_merged, the rows with the largest
    `count_col` values (at most `filter_to` rows per group)."""
    return df_all_merged.groupby('Year').apply(
        lambda grp: grp.nlargest(filter_to, columns=count_col))


df_topbefore = _largest_per_year('CountBefore')
df_topafter = _largest_per_year('CountAfter')
df_top = _largest_per_year('Count')
In [13]:
# Year used by the following cells that select rows with `.loc[year]` and
# that put the year into the saved figure filenames.
year = 2016
In [14]:
# Whole-number float display; first eight rows of the selected year's
# "after" ranking, columns up to and including 'Year'.
pd.set_option('display.float_format', '{:.0f}'.format)
df_topafter.loc[year, :'Year'].iloc[:8]
Out[14]:
In [15]:
# Whole-number float display; first eight rows of the selected year's
# "before" ranking, columns up to and including 'Year'.
pd.set_option('display.float_format', '{:.0f}'.format)
df_topbefore.loc[year, :'Year'].iloc[:8]
Out[15]:
In [16]:
# Whole-number float display; first eight rows of the selected year's
# overall ranking, columns up to and including 'Year'.
pd.set_option('display.float_format', '{:.0f}'.format)
df_top.loc[year, :'Year'].iloc[:8]
Out[16]:
In [17]:
# Two-decimal float display; the eight rows of the selected year's "before"
# ranking with the highest Score2_Bef.
pd.set_option('display.float_format', '{:.2f}'.format)
cols = ['Emoji', 'CountBefore', 'CountAfter', 'Year', 'Score2_Bef']
(df_topbefore.loc[year]
 .loc[:, cols]
 .sort_values('Score2_Bef', ascending=False)
 .iloc[:8])
Out[17]:
In [18]:
# Two-decimal float display; the eight rows of the selected year's "after"
# ranking with the highest Score2_Aft.
pd.set_option('display.float_format', '{:.2f}'.format)
cols = ['Emoji', 'CountAfter', 'CountBefore', 'Year', 'Score2_Aft']
(df_topafter.loc[year]
 .loc[:, cols]
 .sort_values('Score2_Aft', ascending=False)
 .iloc[:8])
Out[18]:
In [19]:
# Horizontal bar chart of PPMBefore (with error bars) for the first ten rows
# of the selected year's "before" ranking; sorted ascending so the largest
# count is drawn at the top.
bar_kwargs = dict(x='Emoji',
                  y='PPMBefore',
                  xerr='PPMBefore_err',
                  figsize=(10,8),
                  legend=False,
                  xlim=(0,32),
                  color='grey')
(df_topbefore.loc[year]
 .iloc[:10]
 .sort_values('CountBefore', ascending=True)
 .plot.barh(**bar_kwargs))
plt.xlabel('Count per million', fontsize=25)
plt.ylabel('')
# The y tick labels are the emoji themselves, hence the emoji font.
plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
plt.xticks(fontsize=20)
plt.savefig('images/before-bar-10-{}-print.pdf'.format(year), bbox_inches='tight')
plt.show()
In [20]:
# Horizontal bar chart of PPMAfter (with error bars) for rows at positions
# 1-10 of the selected year's "after" ranking (position 0 is skipped);
# sorted ascending so the largest count is drawn at the top.
bar_kwargs = dict(x='Emoji',
                  y='PPMAfter',
                  xerr='PPMAfter_err',
                  figsize=(10,8),
                  legend=False,
                  xlim=(0,8),
                  color='grey')
(df_topafter.loc[year]
 .iloc[1:11]
 .sort_values('CountAfter', ascending=True)
 .plot.barh(**bar_kwargs))
plt.xlabel('Count per million', fontsize=25)
plt.ylabel('')
# The y tick labels are the emoji themselves, hence the emoji font.
plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
plt.xticks(fontsize=20)
plt.savefig('images/after-bar-10-{}-print.pdf'.format(year), bbox_inches='tight')
plt.show()
In [21]:
# Horizontal bar chart of Score2_Bef (with error bars) for the ten highest
# Score2_Bef rows of the selected year's "before" ranking; re-sorted
# ascending so the largest score is drawn at the top.
bar_kwargs = dict(x='Emoji',
                  y='Score2_Bef',
                  xerr='Score2_Bef_err',
                  figsize=(10,8),
                  legend=False,
                  xlim=(0,38),
                  color='grey')
(df_topbefore.loc[year]
 .sort_values('Score2_Bef', ascending=False)
 .iloc[:10]
 .sort_values('Score2_Bef', ascending=True)
 .plot.barh(**bar_kwargs))
plt.xlabel('Normalized Relative Frequency (N = 100)', fontsize=25)
plt.ylabel('')
# The y tick labels are the emoji themselves, hence the emoji font.
plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
plt.xticks(fontsize=20)
plt.savefig('images/before-bar-10-NRF100-{}-print.pdf'.format(year), bbox_inches='tight')
plt.show()
In [22]:
# Horizontal bar chart of Score2_Aft (with error bars) for the rows ranked
# at positions 1-10 by descending Score2_Aft in the selected year's "after"
# ranking (position 0 is skipped); re-sorted ascending so the largest score
# is drawn at the top.
bar_kwargs = dict(x='Emoji',
                  y='Score2_Aft',
                  xerr='Score2_Aft_err',
                  figsize=(10,8),
                  legend=False,
                  xlim=(0,85),
                  color='grey')
(df_topafter.loc[year]
 .sort_values('Score2_Aft', ascending=False)
 .iloc[1:11]
 .sort_values('Score2_Aft', ascending=True)
 .plot.barh(**bar_kwargs))
plt.xlabel('Normalized Relative Frequency (N = 100)', fontsize=25)
plt.ylabel('')
# The y tick labels are the emoji themselves, hence the emoji font.
plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
plt.xticks(fontsize=20)
plt.savefig('images/after-bar-10-NRF100-{}-print.pdf'.format(year), bbox_inches='tight')
plt.show()
In [23]:
# Horizontal bar chart of PPM (with error bars) for the six highest-PPM rows
# of the selected year's overall ranking; re-sorted ascending so the largest
# value is drawn at the top.
bar_kwargs = dict(x='Emoji',
                  y='PPM',
                  xerr='PPM_err',
                  figsize=(10,5),
                  legend=False,
                  xlim=(0,50000),
                  color='grey')
(df_top.loc[year]
 .sort_values('PPM', ascending=False)
 .iloc[:6]
 .sort_values('PPM', ascending=True)
 .plot.barh(**bar_kwargs))
plt.xlabel('Count per million', fontsize=25)
plt.ylabel('')
# The y tick labels are the emoji themselves, hence the emoji font.
plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
plt.xticks(fontsize=20)
plt.savefig('images/total-6-{}-print.pdf'.format(year), bbox_inches='tight')
plt.show()
In [24]:
# Union of each year's eight highest-PPM emoji from the overall ranking.
topset = set()
for top_year in [2014, 2015, 2016]:
    yearly_top8 = df_top.loc[top_year].sort_values('PPM', ascending=False).iloc[:8]
    topset.update(yearly_top8.Emoji.tolist())
In [26]:
# Grouped bar chart of PPM per year for emoji in topset.
# Rows are emoji, columns are years; emoji lacking a value in any year are
# dropped, and the last eight rows by ascending 'August 2016' are plotted.
in_topset = df_top.Emoji.isin(list(topset))
plotdf = df_top[in_topset].pivot(index='Emoji', columns='Year', values='PPM')
plotdf.columns = ['August 2014', 'August 2015', 'August 2016']
(plotdf.sort_values('August 2016', ascending=True)
 .dropna()
 .iloc[-8:]
 .plot.barh(figsize=(10,6.5), xlim=(0,50000)))
# The y tick labels are the emoji themselves, hence the emoji font.
plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
plt.xticks(fontsize=20)
plt.legend(fontsize=20)
plt.xlabel('Count per million', fontsize=25)
plt.ylabel('')
plt.savefig('images/total-8-allyears-print.pdf', bbox_inches='tight')
plt.show()
In [27]:
# Three side-by-side panels, one per year: PPMBefore (with error bars) for the
# first ten rows of that year's "before" ranking, sorted ascending so the
# largest count is drawn at the top.
fig, ax = plt.subplots(1, 3, figsize=(10,8))
plt.tight_layout(pad=2.5)
for i, y in enumerate([2014, 2015, 2016]):
    df_topbefore.loc[y].iloc[:10] \
        .sort_values('CountBefore', ascending=True) \
        .plot.barh(x='Emoji',
                   y='PPMBefore',
                   xerr='PPMBefore_err',
                   legend=False,
                   color='grey',
                   ax=ax[i])
    # Make this panel current so the plt.* calls below apply to it.
    plt.sca(ax[i])
    plt.title(y, fontsize=25)
    # The y tick labels are the emoji themselves, hence the emoji font.
    plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
    plt.xticks(fontsize=20)
    plt.ylabel('')
# Only the middle panel carries the x-axis label.
plt.sca(ax[1])
plt.xlabel('Count per million', fontsize=25)
# Fixed filename: the figure covers all years, so it does not depend on `year`.
plt.savefig('images/before-bar-10-allyears-print.pdf', bbox_inches='tight')
plt.show()
In [28]:
# Three side-by-side panels, one per year: PPMAfter (with error bars) for the
# rows at positions 1-10 of that year's "after" ranking (position 0 is
# skipped), sorted ascending so the largest count is drawn at the top.
fig, ax = plt.subplots(1, 3, figsize=(10,8))
plt.tight_layout(pad=2.5)
for i, y in enumerate([2014, 2015, 2016]):
    df_topafter.loc[y].iloc[1:11] \
        .sort_values('CountAfter', ascending=True) \
        .plot.barh(x='Emoji',
                   y='PPMAfter',
                   xerr='PPMAfter_err',
                   legend=False,
                   color='grey',
                   ax=ax[i])
    # Make this panel current so the plt.* calls below apply to it.
    plt.sca(ax[i])
    plt.title(y, fontsize=25)
    # The y tick labels are the emoji themselves, hence the emoji font.
    plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
    plt.xticks(fontsize=20)
    plt.ylabel('')
# Only the middle panel carries the x-axis label.
plt.sca(ax[1])
plt.xlabel('Count per million', fontsize=25)
# Fixed filename: the figure covers all years, so it does not depend on `year`.
plt.savefig('images/after-bar-10-allyears-print.pdf', bbox_inches='tight')
plt.show()
In [29]:
# Three side-by-side panels, one per year: Score2_Bef (with error bars) for
# the ten highest Score2_Bef rows of that year's "before" ranking, re-sorted
# ascending so the largest score is drawn at the top.
fig, ax = plt.subplots(1, 3, figsize=(10,8))
plt.tight_layout(pad=2.5)
for i, y in enumerate([2014, 2015, 2016]):
    df_topbefore.loc[y] \
        .sort_values('Score2_Bef', ascending=False).iloc[:10] \
        .sort_values('Score2_Bef', ascending=True) \
        .plot.barh(x='Emoji',
                   y='Score2_Bef',
                   xerr='Score2_Bef_err',
                   legend=False,
                   color='grey',
                   ax=ax[i])
    # Make this panel current so the plt.* calls below apply to it.
    plt.sca(ax[i])
    plt.title(y, fontsize=25)
    # The y tick labels are the emoji themselves, hence the emoji font.
    plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
    plt.xticks(fontsize=20)
    plt.ylabel('')
# Only the middle panel carries the x-axis label.
plt.sca(ax[1])
plt.xlabel('Normalized Relative Frequency (N = 100)', fontsize=25)
# Fixed filename: the figure covers all years, so it does not depend on `year`.
plt.savefig('images/before-bar-10-NRF100-allyears-print.pdf', bbox_inches='tight')
plt.show()
In [30]:
# Three side-by-side panels, one per year: Score2_Aft (with error bars) for
# the rows ranked at positions 1-10 by descending Score2_Aft in that year's
# "after" ranking (position 0 is skipped), re-sorted ascending so the largest
# score is drawn at the top.
fig, ax = plt.subplots(1, 3, figsize=(10,8))
plt.tight_layout(pad=2.5)
for i, y in enumerate([2014, 2015, 2016]):
    df_topafter.loc[y] \
        .sort_values('Score2_Aft', ascending=False).iloc[1:11] \
        .sort_values('Score2_Aft', ascending=True) \
        .plot.barh(x='Emoji',
                   y='Score2_Aft',
                   xerr='Score2_Aft_err',
                   legend=False,
                   color='grey',
                   ax=ax[i])
    # Make this panel current so the plt.* calls below apply to it.
    plt.sca(ax[i])
    plt.title(y, fontsize=25)
    # The y tick labels are the emoji themselves, hence the emoji font.
    plt.yticks(fontsize=40, fontname='Twitter Color Emoji', rotation='horizontal')
    plt.xticks(fontsize=20)
    plt.ylabel('')
# Only the middle panel carries the x-axis label.
plt.sca(ax[1])
plt.xlabel('Normalized Relative Frequency (N = 100)', fontsize=25)
# Fixed filename: the figure covers all years, so it does not depend on `year`.
plt.savefig('images/after-bar-10-NRF100-allyears-print.pdf', bbox_inches='tight')
plt.show()
In [ ]: