notebook.community

Edit and run



In [ ]:

    
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline



In [ ]:

    
import os
import sys
import json
import bz2
import seaborn
import pandas as pd
from unicode_codes import EMOJI_UNICODE
from twitter_search_funcs import *
from timeit import default_timer as timer



In [ ]:

    
# Root directory of the extracted archive. The search loop expects it to
# contain one sub-directory per day ("01".."31"), each holding one
# sub-directory per hour ("00".."23") of .bz2 files.
# Set the TWITTER_ARCHIVE_PATH environment variable to point at a local copy
# without editing the notebook; the placeholder below is used otherwise.
data_path = os.environ.get("TWITTER_ARCHIVE_PATH",
                           "/your/data/path/archive-twitter-2016-08/")



In [ ]:

    
# Character to match
# Value stored under the ':pistol:' key of EMOJI_UNICODE; the main search loop
# tests whether this value occurs in each tweet's text.
match = EMOJI_UNICODE[':pistol:']



In [ ]:

    
# Counters
# Scalar tallies, all starting from zero: tweets scanned, tweets containing
# the match, and matches with an emoji directly before / after.
counter_total_tweets = counter_total_match = 0
counter_total_before = counter_total_after = 0
# Per-key tallies (neighbouring emoji before/after the match, tweet language).
# Each name gets its own dict object so the tallies never alias each other.
counterdict_before, counterdict_after, counterdict_lang = dict(), dict(), dict()



In [ ]:

    
# Main search loop
#
# Walks <data_path>/<DD>/<HH>/ for 31 days x 24 hours, decompresses every
# bz2 file found there, parses it as newline-delimited JSON and updates:
#   counter_total_tweets  - parsed tweets (delete notices and other
#                           messages without a 'text' field are skipped)
#   counter_total_match   - tweets whose text contains `match`
#   counter_total_before / counterdict_before
#                         - matches whose find_context()[0] is an emoji
#   counter_total_after  / counterdict_after
#                         - matches whose find_context()[2] is an emoji
#   counterdict_lang      - matches per tweet 'lang' value
#
# NOTE: the counters are only incremented here, never reset; re-run the
# counters cell before re-running this one or the totals double-count.
start_t = timer()
for day in range(31):
    day_str = "{:02d}".format(day + 1)
    for hour in range(24):
        hour_str = "{:02d}".format(hour)
        file_path = os.path.join(data_path, day_str, hour_str)
        try:
            files = os.listdir(file_path)
        except OSError:
            # Missing or unreadable day/hour directory: skip it rather
            # than aborting the whole scan.
            continue
        for i, f in enumerate(files):
            progress(i + hour * len(files),
                     len(files) * 24,
                     suffix="Searching Day {}, Hour {}".format(day_str, hour_str))
            try:
                # `with` guarantees the handle is closed; opening is inside
                # the try so an unreadable entry is skipped like a corrupt one.
                with bz2.BZ2File(os.path.join(file_path, f), 'rb') as fbz:
                    fdec = fbz.read()
            except (IOError, OSError, EOFError):
                # EOFError is what a truncated bz2 stream raises.
                continue
            # Replace undecodable bytes instead of raising, so one damaged
            # byte cannot stop the run; affected lines that are no longer
            # valid JSON are dropped by the ValueError handler below.
            fdecutf = fdec.decode('utf-8', errors='replace')

            for t in fdecutf.split('\n'):
                try:
                    tweet = json.loads(t)
                except ValueError:
                    # Blank or malformed line.
                    continue
                if not isinstance(tweet, dict) or 'delete' in tweet:
                    continue
                text = tweet.get('text')
                if text is None:
                    # Not a tweet payload (no text to search).
                    continue
                counter_total_tweets += 1
                if match not in text:
                    continue

                counter_total_match += 1
                result = find_context(text, match)

                before, after = result[0], result[2]
                if before in EMOJI_UNICODE.values():
                    counter_total_before += 1
                    counterdict_before[before] = counterdict_before.get(before, 0) + 1

                if after in EMOJI_UNICODE.values():
                    counter_total_after += 1
                    counterdict_after[after] = counterdict_after.get(after, 0) + 1

                # Tweets without a 'lang' field are simply not tallied.
                if 'lang' in tweet:
                    lang = tweet['lang']
                    counterdict_lang[lang] = counterdict_lang.get(lang, 0) + 1
end_t = timer()



In [ ]:

    
# Print outputs
# Wall-clock duration of the search loop, converted from seconds to minutes.
elapsed_minutes = (end_t - start_t) / 60
print("Elapsed Time    : {:.2f} min".format(elapsed_minutes))

# Integer tallies, one line each, in a fixed order.
for line_template, tally in (
    ("Total Tweets    : {:d}", counter_total_tweets),
    ("Total Matches   : {:d}", counter_total_match),
    ("Total w/ Before : {:d}", counter_total_before),
    ("Total w/ After  : {:d}", counter_total_after),
):
    print(line_template.format(tally))



In [ ]:

    
# Convert output to dataframe
# Each tally dict becomes a two-column frame: one row per (key, count) pair.
df_before, df_after, df_lang = (
    pd.DataFrame(list(tally.items()), columns=list(column_names))
    for tally, column_names in (
        (counterdict_before, ('Emoji', 'CountBefore')),
        (counterdict_after, ('Emoji', 'CountAfter')),
        (counterdict_lang, ('Lang', 'Count')),
    )
)



In [ ]:

    
# Merge before and after dataframes
# The outer join keeps every emoji seen on either side of the match. An emoji
# seen on only one side would otherwise get NaN for the other count, which
# also turns both count columns into floats; such an emoji was observed zero
# times on that side, so fill with 0 and restore the integer dtype.
df_all = (
    pd.merge(df_before, df_after, on='Emoji', how='outer')
    .fillna({'CountBefore': 0, 'CountAfter': 0})
    .astype({'CountBefore': int, 'CountAfter': int})
)



In [ ]:

    
# First five rows when ordered by descending 'CountBefore'.
df_all.sort_values(by='CountBefore', ascending=False).head(n=5)



In [ ]:

    
# First five rows when ordered by descending 'CountAfter'.
df_all.sort_values(by='CountAfter', ascending=False).head(n=5)



In [ ]:

    
# First five rows when ordered by descending 'Count'.
df_lang.sort_values(by='Count', ascending=False).head(n=5)



In [ ]:

    
# Bar chart of 'CountBefore' for the first 20 rows in descending order.
df_all.sort_values(by='CountBefore', ascending=False).head(n=20).plot(kind='bar', y='CountBefore')



In [ ]:

    
# Bar chart of 'CountAfter' for the first 20 rows in descending order.
df_all.sort_values(by='CountAfter', ascending=False).head(n=20).plot(kind='bar', y='CountAfter')



In [ ]:

    
# Export results as CSV files
# Both files are written to the current working directory.
df_all.to_csv(path_or_buf="./alldata.csv")    # merged before/after emoji counts
df_lang.to_csv(path_or_buf="./langdata.csv")  # per-language match counts



In [ ]: