In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
import os
import sys
import json
import bz2
import seaborn
import pandas as pd
from unicode_codes import EMOJI_UNICODE
from twitter_search_funcs import *
from timeit import default_timer as timer
In [3]:
# Root folder of the extracted Twitter archive; the search loop reads
# <data_path>/<DD>/<HH>/ from it.
# To point the notebook at another copy without editing this cell, set the
# TWITTER_ARCHIVE_PATH environment variable, e.g.
#   TWITTER_ARCHIVE_PATH="/your/data/path/archive-twitter-2016-08/"
# An unset or empty variable falls back to the original local path below.
data_path = (
    os.environ.get("TWITTER_ARCHIVE_PATH")
    or "/Volumes/Media HD/Downloads/archive-twitter-2016-08"
)
In [4]:
# Emoji sequences to search for: the pistol on its own, plus pistol
# combinations of two or three emoji. Each tuple lists the emoji names whose
# characters are concatenated, in order, into one search string.
_match_names = [
    (':pistol:',),
    (':face_without_mouth:', ':pistol:'),
    (':upside-down_face:', ':pistol:'),
    (':police_officer:', ':pistol:'),
    (':pistol:', ':police_officer:'),
    (':kitchen_knife:', ':pistol:', ':bomb:'),
    (':rooster:', ':pistol:'),
    (':collision:', ':pistol:'),
    (':broken_heart:', ':pistol:'),
]
matchlist = [
    ''.join(EMOJI_UNICODE[name] for name in names)
    for names in _match_names
]
In [5]:
# Echo the assembled search strings so the emoji sequences can be checked by eye.
matchlist
Out[5]:
In [6]:
# Bookkeeping for the search loop
max_examples = 200        # cap on saved examples per search string
counter_total_tweets = 0  # tweets counted by the search loop so far
# Saved examples so far, keyed by search string; every key starts at zero.
counter_examples = dict.fromkeys(matchlist, 0)
In [ ]:
# Main search loop
#
# Scans the archive at <data_path>/<DD>/<HH>/<file> (days 01-31, hours 00-23).
# Every file is bz2-decompressed, decoded as UTF-8 and split into lines, and
# each line is parsed as JSON. Records carrying a 'delete' key or lacking a
# 'lang' key are ignored; every other record increments counter_total_tweets.
# For English tweets, each search string found in the tweet text is recorded
# (up to max_examples per string) by appending the running example number and
# the text to ./data/examples_<search string>.txt. The scan stops as soon as
# every search string has reached max_examples.


def _iter_tweets(root):
    """Yield each JSON object found in the archive below ``root``.

    Directories are visited as ``root/DD/HH`` in day/hour order and the files
    inside each one in sorted name order, so repeated runs see the tweets in
    the same sequence. Hour directories that do not exist are skipped, as are
    files that cannot be opened, decompressed or decoded, and lines that are
    not valid JSON objects.
    """
    for day in range(1, 32):
        for hour in range(24):
            hour_dir = os.path.join(
                root, "{:02d}".format(day), "{:02d}".format(hour)
            )
            if not os.path.isdir(hour_dir):
                # Gap in the archive: nothing to read for this hour.
                continue
            for name in sorted(os.listdir(hour_dir)):
                try:
                    with bz2.BZ2File(os.path.join(hour_dir, name), 'rb') as fbz:
                        content = fbz.read().decode('utf-8')
                except (OSError, EOFError, UnicodeDecodeError):
                    # OSError: unreadable file or invalid bz2 data;
                    # EOFError: truncated bz2 stream;
                    # UnicodeDecodeError: payload is not valid UTF-8.
                    continue
                for line in content.split('\n'):
                    try:
                        record = json.loads(line)
                    except ValueError:
                        # Blank or malformed line.
                        continue
                    if isinstance(record, dict):
                        yield record


# Fail fast on a wrong archive path rather than silently scanning nothing.
if not os.path.isdir(data_path):
    raise FileNotFoundError(
        "Archive directory not found: {}".format(data_path)
    )

# The example files are written below ./data; make sure it exists.
os.makedirs('./data', exist_ok=True)

start_t = timer()
for tweet in _iter_tweets(data_path):
    if 'delete' in tweet or 'lang' not in tweet:
        continue
    counter_total_tweets += 1

    text = tweet.get('text')
    if tweet['lang'] == 'en' and isinstance(text, str):
        for match in matchlist:
            if match in text and counter_examples[match] < max_examples:
                counter_examples[match] += 1
                # Append mode: rerunning the cell adds to any existing files.
                # UTF-8 is forced so emoji can be written under any locale.
                with open('./data/examples_{}.txt'.format(match), 'a',
                          encoding='utf-8') as outfile:
                    outfile.write(str(counter_examples[match]))
                    outfile.write('\n')
                    outfile.write(text)
                    outfile.write('\n\n')

    # Stop once every search string has collected its full quota.
    if sum(counter_examples.values()) >= max_examples * len(matchlist):
        break
end_t = timer()
In [ ]:
# Report the run: wall-clock duration in minutes, the number of tweets
# counted, and the number of saved examples per search string.
elapsed_minutes = (end_t - start_t) / 60
print("Elapsed Time : {:.2f} min".format(elapsed_minutes))
print("Total Tweets : {:d}".format(counter_total_tweets))
print(counter_examples)
In [ ]: