In [ ]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [ ]:
import os
import sys
import json
import bz2
import seaborn
import pandas as pd
from unicode_codes import EMOJI_UNICODE
from twitter_search_funcs import *
from timeit import default_timer as timer
In [ ]:
# Root directory of the Twitter archive to search. The search loop reads
# <data_path>/<DD>/<HH>/ sub-directories (two-digit day and hour) beneath it.
# The value below is a placeholder; point it at the local copy of the data.
data_path = "/your/data/path/archive-twitter-2016-08/"
In [ ]:
# Character to match: the emoji stored under the ':pistol:' key of
# EMOJI_UNICODE. Tweets whose text contains this string count as matches.
match = EMOJI_UNICODE[':pistol:']
In [ ]:
# Counters
# Scalar tallies all start at zero (ints are immutable, so chaining is safe).
counter_total_tweets = counter_total_match = 0
counter_total_before = counter_total_after = 0
# Per-key tallies; tuple assignment gives three independent dict objects.
counterdict_before, counterdict_after, counterdict_lang = {}, {}, {}
In [ ]:
# Main search loop
#
# Walks the archive as <data_path>/<DD>/<HH>/<file> for days 01-31 and hours
# 00-23, decompresses every bz2 file, parses one JSON object per line and
# updates the module-level counters:
#   counter_total_tweets  - status messages seen (delete notices excluded)
#   counter_total_match   - statuses whose text contains `match`
#   counter_total_before  - matches where find_context(...)[0] is an emoji
#   counter_total_after   - matches where find_context(...)[2] is an emoji
#   counterdict_before / counterdict_after - per-emoji tallies of the above
#   counterdict_lang      - per-language tally of matching statuses
#
# NOTE(review): the block nesting was reconstructed from a flattened export.
# The language tally is assumed to apply to matching tweets only - confirm
# against the original notebook.

# `x in dict.values()` is a linear scan; build the lookup set once instead of
# rescanning the whole emoji table twice per matching tweet.
emoji_set = set(EMOJI_UNICODE.values())

start_t = timer()
for day in range(31):
    day_str = "{:02d}".format(day + 1)
    for hour in range(24):
        hour_str = "{:02d}".format(hour)
        file_path = os.path.join(data_path, day_str, hour_str)
        files = os.listdir(file_path)
        for i, f in enumerate(files):
            progress(i + hour * len(files),
                     len(files) * 24,
                     suffix="Searching Day {}, Hour {}".format(day_str, hour_str))
            try:
                # The context manager closes the file on success and failure.
                with bz2.BZ2File(os.path.join(file_path, f), 'rb') as fbz:
                    fdec = fbz.read()
            except (IOError, EOFError):
                # Unreadable, corrupt or truncated archive member: skip it.
                # (A truncated bz2 stream raises EOFError, not IOError.)
                continue
            fdecutf = fdec.decode('utf-8')
            for t in fdecutf.split('\n'):
                try:
                    tweet = json.loads(t)
                except ValueError:
                    # Blank trailing line or malformed JSON.
                    continue
                # Skip anything that is not a JSON object, and delete notices.
                if not isinstance(tweet, dict) or 'delete' in tweet:
                    continue
                text = tweet.get('text')
                if text is None:
                    # Stream message without a text body; indexing
                    # tweet['text'] here would raise KeyError and abort the
                    # whole scan.
                    continue
                counter_total_tweets += 1
                if match not in text:
                    continue
                counter_total_match += 1
                result = find_context(text, match)
                # presumably result[0] / result[2] are the characters on
                # either side of the match - verify against find_context.
                before, after = result[0], result[2]
                if before in emoji_set:
                    counter_total_before += 1
                    counterdict_before[before] = counterdict_before.get(before, 0) + 1
                if after in emoji_set:
                    counter_total_after += 1
                    counterdict_after[after] = counterdict_after.get(after, 0) + 1
                # Statuses without a 'lang' key are left out of the tally.
                if 'lang' in tweet:
                    lang = tweet['lang']
                    counterdict_lang[lang] = counterdict_lang.get(lang, 0) + 1
end_t = timer()
In [ ]:
# Print outputs
# Elapsed wall-clock time of the search loop, converted from seconds to minutes.
elapsed_minutes = (end_t - start_t) / 60
print("Elapsed Time : {:.2f} min".format(elapsed_minutes))
# Each running total is printed with its own label, in a fixed order.
for template, total in (
    ("Total Tweets : {:d}", counter_total_tweets),
    ("Total Matches : {:d}", counter_total_match),
    ("Total w/ Before : {:d}", counter_total_before),
    ("Total w/ After : {:d}", counter_total_after),
):
    print(template.format(total))
In [ ]:
# Convert output to dataframe
def _counts_to_frame(counts, key_column, count_column):
    """Return a two-column DataFrame holding one row per (key, count) pair."""
    rows = list(counts.items())
    return pd.DataFrame(rows, columns=[key_column, count_column])


df_before = _counts_to_frame(counterdict_before, 'Emoji', 'CountBefore')
df_after = _counts_to_frame(counterdict_after, 'Emoji', 'CountAfter')
df_lang = _counts_to_frame(counterdict_lang, 'Lang', 'Count')
In [ ]:
# Merge before and after dataframes
# Outer join on the emoji column: an emoji present in only one of the two
# tables is kept, with NaN in the count column it has no value for.
df_all = df_before.merge(df_after, how='outer', on='Emoji')
In [ ]:
# Show the five rows with the highest CountBefore (head() defaults to 5 rows).
df_all.sort_values('CountBefore', ascending=False).head()
In [ ]:
# Show the five rows with the highest CountAfter (head() defaults to 5 rows).
df_all.sort_values('CountAfter', ascending=False).head()
In [ ]:
# Show the five language codes with the highest tweet count.
df_lang.sort_values('Count', ascending=False).head()
In [ ]:
# Bar chart of the 20 largest CountBefore values. No x column is given, so the
# bars are labelled with the DataFrame index rather than the emoji itself.
df_all.sort_values('CountBefore', ascending=False).head(20).plot.bar(y='CountBefore')
In [ ]:
# Bar chart of the 20 largest CountAfter values. No x column is given, so the
# bars are labelled with the DataFrame index rather than the emoji itself.
df_all.sort_values('CountAfter', ascending=False).head(20).plot.bar(y='CountAfter')
In [ ]:
# Export results as CSV files
# Each table is written to its own file in the current working directory.
for frame, target in ((df_all, "./alldata.csv"), (df_lang, "./langdata.csv")):
    frame.to_csv(target)
In [ ]: