In [1]:
import os
import sys
import time
import json
import gzip
import string
from datetime import datetime, date
import time
import unicodedata
import calendar
PARSE_LIMIT = None # limits the total amount of data parsed during testing
# All data files live in ./data relative to the notebook's working directory
DATA_PATH = os.path.join(os.getcwd(), 'data')
RATEBEER_FILENAME = os.path.join(DATA_PATH, 'Ratebeer.txt.gz')
BEERADVOCATE_FILENAME = os.path.join(DATA_PATH, 'Beeradvocate.txt.gz')
# OUT_FILENAME = os.path.join(DATA_PATH, 'mobile-data.txt')
OUT_FILENAME = os.path.join(DATA_PATH, 'reduced_data.txt')  # date-filtered reviews, one JSON object per line
SORTED_FILENAME = os.path.join(DATA_PATH, 'sorted_data.txt')  # same reviews, sorted by review/timeUnix
# Two-year window used to filter the reviews
START_DATE = date(2008,5,1)
END_DATE = date(2010,5,1)
In [2]:
"""
Convert Fraction String to Float
"""
def convert_fraction_string(val_str):
parts = val_str.split('/')
try:
return float(int(parts[0])) / int(parts[1])
except (ZeroDivisionError, IndexError):
return 0.0
In [3]:
"""
Sanitize Beer Data
Basic Implementation Adapted from http://jmcauley.ucsd.edu/cse255/data/beer/processRaw.py
"""
def sanitize_ba(e):
try:
e['review/appearance'] = float(e['review/appearance'])
e['review/taste'] = float(e['review/taste'])
e['review/overall'] = float(e['review/overall'])
e['review/palate'] = float(e['review/palate'])
e['review/aroma'] = float(e['review/aroma'])
e['review/timeUnix'] = int(e['review/time'])
e.pop('review/time', None)
try:
e['beer/ABV'] = float(e['beer/ABV'])
except Exception as q:
e.pop('beer/ABV', None)
e['user/profileName'] = e['review/profileName']
e.pop('review/profileName', None)
timeStruct = time.gmtime(e['review/timeUnix'])
e['review/timeStruct'] = dict(zip(["year", "mon", "mday", "hour", "min", "sec", "wday", "yday", "isdst"], list(timeStruct)))
except Exception as q:
print q
pass
def sanitize_rb(e):
try:
e['review/appearance'] = convert_fraction_string(e['review/appearance'])
e['review/taste'] = convert_fraction_string(e['review/taste'])
e['review/overall'] = convert_fraction_string(e['review/overall'])
e['review/palate'] = convert_fraction_string(e['review/palate'])
e['review/aroma'] = convert_fraction_string(e['review/aroma'])
e['review/timeUnix'] = int(e['review/time'])
e.pop('review/time', None)
try:
e['beer/ABV'] = float(e['beer/ABV'])
except Exception as q:
e.pop('beer/ABV', None)
e['user/profileName'] = e['review/profileName']
e.pop('review/profileName', None)
timeStruct = time.gmtime(e['review/timeUnix'])
e['review/timeStruct'] = dict(zip(["year", "mon", "mday", "hour", "min", "sec", "wday", "yday", "isdst"], list(timeStruct)))
except Exception as q:
print q
pass
def sanitizer_switch(e, source):
if source == 'BA':
sanitize_ba(e)
elif source == 'RB':
sanitize_rb(e)
return e
In [4]:
# Initialize Beer Parsing of File when Called
def parse_beer(filename, source=None):
    """Stream reviews from a gzipped 'field: value' file.

    Records are groups of 'field: value' lines separated by blank lines;
    each separator flushes the accumulated entry through sanitizer_switch
    and yields it. The file handle is now closed via a with-block, and the
    final yield is guarded so a trailing blank line no longer produces a
    spurious empty dict.
    """
    with gzip.open(filename, 'r') as f:
        entry = {}
        for l in f:
            l = l.strip()
            colonPos = l.find(':')
            if colonPos == -1:
                # blank line: end of the current record
                yield sanitizer_switch(entry, source)
                entry = {}
                continue
            eName = l[:colonPos]
            rest = l[colonPos+2:]  # skip the ': ' after the field name
            entry[eName] = rest
        if entry:
            # last record when the file doesn't end with a blank line
            yield sanitizer_switch(entry, source)
In [5]:
"""
Helper function to find min/max date from a time period
"""
def min_max_dates(beer_iter, parse_limit=None):
count = 0
min_date, max_date = None , None
first_review, last_review = None, None
for i, review in enumerate(beer_iter):
if parse_limit is not None and i >= parse_limit:
break
if review and (min_date is None or review.get('review/timeUnix') < min_date):
first_review = review
min_date = review.get('review/timeUnix')
if review and (max_date is None or review.get('review/timeUnix') > max_date):
last_review = review
max_date = review.get('review/timeUnix')
return first_review, last_review, count
In [11]:
"""
Helper function to filter data based on provided min/max date
"""
def filter_data_date(start, end, beer_iter, filename):
filter_count = 0
total_count = 0
with open(filename, 'w') as f:
for i, review in enumerate(beer_iter):
total_count += 1
if review and (review.get('review/timeUnix') >= start) and (review.get('review/timeUnix') < end):
filter_count += 1
json.dump(review, f, ensure_ascii=False)
f.write('\n')
f.close()
return filter_count, total_count
In [12]:
"""
Helper to return unicode string as ascii with special characters removed.
"""
def clean_unicode(instr):
return unicodedata.normalize('NFKD', instr).encode('ascii', 'ignore')
In [13]:
# first, last = min_max_dates(data_iter, PARSE_LIMIT)
In [14]:
"""
Helper to parse and yield data from file to json
"""
def parse_json(filename, normalize=True):
with open(filename) as f:
for line in f:
out = json.loads(line, encoding='latin-1')
if normalize:
temp = {}
for key in out:
try:
key = clean_unicode(key)
except:
key = key
try:
temp[key] = clean_unicode(out[key])
except:
temp[key] = out[key]
out = temp
yield out
f.close()
In [17]:
# only run once - parses all data and writes reduced data to file
start = time.time()
data_iter = parse_beer(RATEBEER_FILENAME, 'RB')
# timegm interprets the date bounds as UTC, matching time.gmtime used
# when building review/timeStruct
start_timestamp = calendar.timegm(START_DATE.timetuple())
end_timestamp = calendar.timegm(END_DATE.timetuple())
filt_count, total_count = filter_data_date(start_timestamp, end_timestamp, data_iter, OUT_FILENAME)
# print() call form is valid on both Python 2 and 3
print('Finished writing parsed data to %s in %0.3fs' % (OUT_FILENAME, time.time() - start))
In [19]:
"""
Read the reviews data, and write a sorted copy.
"""
timestamp_keyfunc = lambda r: r.get('review/timeUnix')
reviews_iter = parse_json(OUT_FILENAME)
start = time.time()
with open(SORTED_FILENAME, 'w') as outfile:
for review in sorted(reviews_iter, key=timestamp_keyfunc):
review_date = date.fromtimestamp(review.get('review/timeUnix'))
if review_date >= START_DATE and review_date < END_DATE:
json.dump(review, outfile, ensure_ascii=True)
outfile.write('\n')
print 'Finished writing date sorted data to %s in %0.3fs' % (SORTED_FILENAME, time.time() - start)
sorted_reviews = None
In [ ]: