notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
import os
import sys
import json
import bz2
import seaborn
import pandas as pd
from unicode_codes import EMOJI_UNICODE
from twitter_search_funcs import *
from timeit import default_timer as timer



In [3]:

    
# Root directory of the extracted archive. The search loop below joins it with
# zero-padded day and hour sub-directories (<root>/<DD>/<HH>/).
# Set the TWITTER_ARCHIVE_PATH environment variable to point at another copy
# without editing the notebook, e.g.
#   export TWITTER_ARCHIVE_PATH="/your/data/path/archive-twitter-2016-08/"
data_path = os.environ.get(
    "TWITTER_ARCHIVE_PATH",
    "/Volumes/Media HD/Downloads/archive-twitter-2016-08",
)



In [4]:

    
# Emoji sequences to search for. Each inner tuple lists the emoji aliases that
# are concatenated, in order, into one search string (one to three emoji).
matchlist = [
    ''.join(EMOJI_UNICODE[alias] for alias in aliases)
    for aliases in (
        (':pistol:',),
        (':face_without_mouth:', ':pistol:'),
        (':upside-down_face:', ':pistol:'),
        (':police_officer:', ':pistol:'),
        (':pistol:', ':police_officer:'),
        (':kitchen_knife:', ':pistol:', ':bomb:'),
        (':rooster:', ':pistol:'),
        (':collision:', ':pistol:'),
        (':broken_heart:', ':pistol:'),
    )
]



In [5]:

    
matchlist









    Out[5]:





['🔫', '😶🔫', '🙃🔫', '👮🔫', '🔫👮', '🔪🔫💣', '🐓🔫', '💥🔫', '💔🔫']



In [6]:

    
# Search state
# Upper bound on the number of example tweets saved per search string
max_examples = 200
# Number of examples saved so far, keyed by search string
counter_examples = dict.fromkeys(matchlist, 0)
# Number of tweets inspected (deletion notices and records without 'lang' excluded)
counter_total_tweets = 0



In [7]:

    
# Main search loop
def _iter_tweets(root):
    """Yield every JSON object stored under ``root/<DD>/<HH>/<file>``.

    Walks days 01-31 and hours 00-23. Each file in an hour directory is read
    as a bz2 stream, decoded as UTF-8 and split on newlines; every line that
    parses as a JSON object is yielded as a dict.

    Anything that cannot be processed is skipped rather than aborting the
    scan: missing day/hour directories, entries that cannot be opened or are
    not valid bz2 data, truncated archives, lines that are not valid JSON and
    JSON values that are not objects.
    """
    for day in range(1, 32):
        for hour in range(24):
            hour_dir = os.path.join(root, "{:02d}".format(day), "{:02d}".format(hour))
            try:
                # Sorted so that the scan order (and therefore which examples
                # are collected) does not depend on the file system.
                file_names = sorted(os.listdir(hour_dir))
            except OSError:
                # Directory absent or unreadable: nothing to scan for this hour.
                continue
            for file_name in file_names:
                try:
                    # Opening is inside the try as well, so an unreadable entry
                    # or a stray sub-directory does not stop the whole run.
                    with bz2.BZ2File(os.path.join(hour_dir, file_name), 'rb') as fbz:
                        raw = fbz.read()
                except (OSError, EOFError):
                    # OSError: not a bz2 stream / cannot open; EOFError: truncated stream.
                    continue
                # errors='replace' keeps one bad byte sequence from raising
                # UnicodeDecodeError and ending the scan.
                for line in raw.decode('utf-8', errors='replace').split('\n'):
                    try:
                        record = json.loads(line)
                    except ValueError:
                        continue
                    if isinstance(record, dict):
                        yield record


start_t = timer()

# Example files are appended to ./data/, so make sure the directory exists.
os.makedirs('./data', exist_ok=True)

# The scan stops once every search string has reached max_examples.
target_total = max_examples * len(matchlist)

for tweet in _iter_tweets(data_path):
    # Skip deletion notices and records that carry no language tag.
    if 'delete' in tweet or 'lang' not in tweet:
        continue
    counter_total_tweets += 1

    # Only English tweets are saved as examples.
    if tweet['lang'] == 'en':
        text = tweet.get('text') or ''
        for match in matchlist:
            if match in text and counter_examples[match] < max_examples:
                counter_examples[match] += 1
                # Explicit UTF-8: the text contains emoji, which a locale
                # default encoding may be unable to represent.
                with open('./data/examples_{}.txt'.format(match), 'a', encoding='utf-8') as outfile:
                    outfile.write(str(counter_examples[match]))
                    outfile.write('\n')
                    outfile.write(text)
                    outfile.write('\n\n')

    if sum(counter_examples.values()) >= target_total:
        break

end_t = timer()



In [8]:

    
# Summary: wall-clock duration in minutes, number of tweets inspected, and the
# number of examples saved for each search string.
print(
    "Elapsed Time    : {:.2f} min".format((end_t - start_t) / 60),
    "Total Tweets    : {:d}".format(counter_total_tweets),
    counter_examples,
    sep="\n",
)









    



Elapsed Time    : 500.29 min
Total Tweets    : 101248831
{'🔫': 200, '😶🔫': 36, '🙃🔫': 200, '👮🔫': 6, '🔫👮': 19, '🔪🔫💣': 11, '🐓🔫': 1, '💥🔫': 200, '💔🔫': 39}



In [ ]: