In [1]:
import os
import sys
import time
import json
import gzip
import string
from datetime import datetime, date
import time
import unicodedata
import calendar

PARSE_LIMIT = None  # limits the total amount of data parsed during testing

# All input/output files live under ./data relative to the working directory.
DATA_PATH = os.path.join(os.getcwd(), 'data')
RATEBEER_FILENAME = os.path.join(DATA_PATH, 'Ratebeer.txt.gz')  # raw Ratebeer dump (gzipped "field: value" lines)
BEERADVOCATE_FILENAME = os.path.join(DATA_PATH, 'Beeradvocate.txt.gz')  # raw BeerAdvocate dump, same format

# OUT_FILENAME = os.path.join(DATA_PATH, 'mobile-data.txt')
OUT_FILENAME = os.path.join(DATA_PATH, 'reduced_data.txt')  # date-filtered reviews, one JSON object per line
SORTED_FILENAME = os.path.join(DATA_PATH, 'sorted_data.txt')  # OUT_FILENAME rewritten in timestamp order

# Two-year study window; reviews outside [START_DATE, END_DATE) are dropped.
START_DATE = date(2008,5,1)
END_DATE = date(2010,5,1)

In [2]:
"""
Convert Fraction String to Float
"""
def convert_fraction_string(val_str):
    """Convert a fraction string such as '13/20' to a float (0.65).

    Returns 0.0 for any malformed input: a missing '/' separator,
    non-integer parts (e.g. 'n/a'), or a zero denominator.
    """
    parts = val_str.split('/')
    try:
        return float(int(parts[0])) / int(parts[1])
    # ValueError added: int() on non-numeric text previously escaped the
    # handler and crashed the caller instead of yielding the 0.0 default.
    except (ZeroDivisionError, IndexError, ValueError):
        return 0.0

In [3]:
"""
Sanitize Beer Data
Basic Implementation Adapted from http://jmcauley.ucsd.edu/cse255/data/beer/processRaw.py
"""
def sanitize_ba(e):
    """Coerce the raw string fields of a BeerAdvocate review dict, in place.

    - the five review scores -> float
    - 'review/time' -> int under 'review/timeUnix', plus a broken-out
      'review/timeStruct' dict (UTC, via time.gmtime)
    - 'review/profileName' -> renamed to 'user/profileName'
    - 'beer/ABV' -> float, or dropped entirely when unparseable

    Any other failure (typically a missing key) is printed and swallowed so
    one bad record does not abort a long parsing run.
    """
    try:
        e['review/appearance'] = float(e['review/appearance'])
        e['review/taste'] = float(e['review/taste'])
        e['review/overall'] = float(e['review/overall'])
        e['review/palate'] = float(e['review/palate'])
        e['review/aroma'] = float(e['review/aroma'])
        e['review/timeUnix'] = int(e['review/time'])
        e.pop('review/time', None)
        try:
            e['beer/ABV'] = float(e['beer/ABV'])
        except Exception:
            # ABV is optional; drop it rather than keep an unparseable string
            e.pop('beer/ABV', None)
        e['user/profileName'] = e['review/profileName']
        e.pop('review/profileName', None)
        timeStruct = time.gmtime(e['review/timeUnix'])
        e['review/timeStruct'] = dict(zip(["year", "mon", "mday", "hour", "min", "sec", "wday", "yday", "isdst"], list(timeStruct)))
    except Exception as q:
        # print(q) works on Python 2 (single parenthesized arg) and Python 3;
        # the bare `print q` statement was Python-2-only syntax.
        print(q)


def sanitize_rb(e):
    """Coerce the raw string fields of a Ratebeer review dict, in place.

    Same contract as sanitize_ba, except Ratebeer scores arrive as fraction
    strings ('13/20') and are converted via convert_fraction_string.
    Any failure (typically a missing key) is printed and swallowed so one
    bad record does not abort a long parsing run.
    """
    try:
        e['review/appearance'] = convert_fraction_string(e['review/appearance'])
        e['review/taste'] = convert_fraction_string(e['review/taste'])
        e['review/overall'] = convert_fraction_string(e['review/overall'])
        e['review/palate'] = convert_fraction_string(e['review/palate'])
        e['review/aroma'] = convert_fraction_string(e['review/aroma'])
        e['review/timeUnix'] = int(e['review/time'])
        e.pop('review/time', None)
        try:
            e['beer/ABV'] = float(e['beer/ABV'])
        except Exception:
            # ABV is optional; drop it rather than keep an unparseable string
            e.pop('beer/ABV', None)
        e['user/profileName'] = e['review/profileName']
        e.pop('review/profileName', None)
        timeStruct = time.gmtime(e['review/timeUnix'])
        e['review/timeStruct'] = dict(zip(["year", "mon", "mday", "hour", "min", "sec", "wday", "yday", "isdst"], list(timeStruct)))
    except Exception as q:
        # print(q) works on Python 2 (single parenthesized arg) and Python 3;
        # the bare `print q` statement was Python-2-only syntax.
        print(q)
    
def sanitizer_switch(e, source):
    """Route a raw review dict to the sanitizer matching its source.

    source is 'BA' (BeerAdvocate) or 'RB' (Ratebeer); any other value
    leaves the entry untouched. The dict is mutated in place and returned
    for convenience.
    """
    if source in ('BA', 'RB'):
        (sanitize_ba if source == 'BA' else sanitize_rb)(e)
    return e

In [4]:
# Initialize Beer Parsing of File when Called
def parse_beer(filename, source=None):
    """Stream review entries out of a gzipped "field: value" dump.

    Lines of the form 'field: value' accumulate into a dict; a line with
    no colon marks the end of the current entry. Each completed entry is
    passed through sanitizer_switch(entry, source) before being yielded.

    source: 'BA' or 'RB' selects the sanitizer; None yields raw dicts.
    """
    # NOTE(review): under Python 3 gzip.open(..., 'r') yields bytes; this
    # code was written for Python 2 where 'r' yields str. Use 'rt' on Py3.
    entry = {}
    with gzip.open(filename, 'r') as f:  # `with` closes the handle even if the consumer abandons the generator
        for l in f:
            l = l.strip()
            colonPos = l.find(':')
            if colonPos == -1:
                # separator line: current entry is complete
                yield sanitizer_switch(entry, source)
                entry = {}
                continue
            eName = l[:colonPos]
            rest = l[colonPos+2:]  # skip ': ' after the field name
            entry[eName] = rest
    # Flush a trailing entry only if the file did not end on a separator;
    # the original unconditionally yielded a spurious empty dict here.
    if entry:
        yield sanitizer_switch(entry, source)

In [5]:
"""
Helper function to find min/max date from a time period
"""
def min_max_dates(beer_iter, parse_limit=None):
    """Scan reviews and return (earliest_review, latest_review, count).

    beer_iter yields review dicts carrying 'review/timeUnix'.
    parse_limit, when given, caps how many reviews are examined.
    count is the number of reviews actually inspected — the original
    version declared it but never incremented it, so it was always 0.
    Falsy entries and entries without a timestamp are skipped.
    """
    count = 0
    min_date, max_date = None, None
    first_review, last_review = None, None

    for i, review in enumerate(beer_iter):
        if parse_limit is not None and i >= parse_limit:
            break
        count += 1
        if not review:
            continue
        ts = review.get('review/timeUnix')
        if ts is None:
            # guard: comparing None against ints raises TypeError on Python 3
            continue
        if min_date is None or ts < min_date:
            first_review = review
            min_date = ts
        if max_date is None or ts > max_date:
            last_review = review
            max_date = ts

    return first_review, last_review, count

In [11]:
"""
Helper function to filter data based on provided min/max date
"""
def filter_data_date(start, end, beer_iter, filename):
    """Write reviews whose 'review/timeUnix' lies in [start, end) to filename.

    Output is JSON-lines: one object per line. Returns a tuple
    (filter_count, total_count) — how many reviews were written versus how
    many were read from beer_iter. Falsy entries and entries without a
    timestamp are counted in the total but never written.
    """
    filter_count = 0
    total_count = 0
    with open(filename, 'w') as f:
        for review in beer_iter:  # enumerate index in the original was unused
            total_count += 1
            if not review:
                continue
            ts = review.get('review/timeUnix')
            # explicit None guard: missing timestamps previously reached the
            # comparison, which raises TypeError on Python 3
            if ts is not None and start <= ts < end:
                filter_count += 1
                json.dump(review, f, ensure_ascii=False)
                f.write('\n')
    # the original called f.close() here; the with-block already closed it
    return filter_count, total_count

In [12]:
"""
Helper to return unicode string as ascii with special characters removed.
"""
def clean_unicode(instr):
    """Return `instr` reduced to ASCII: accents are decomposed via NFKD
    normalization, then any character ASCII cannot represent is dropped."""
    decomposed = unicodedata.normalize('NFKD', instr)
    return decomposed.encode('ascii', 'ignore')

In [13]:
# first, last = min_max_dates(data_iter, PARSE_LIMIT)

In [14]:
"""
Helper to parse and yield data from file to json
"""
def parse_json(filename, normalize=True):
    """Yield one review dict per line of a JSON-lines file.

    The file is decoded as latin-1, matching the encoding the original
    json.loads(..., encoding='latin-1') call requested — that keyword was
    ignored on Python 3 and removed entirely in 3.9, so the decode now
    happens at the file layer instead. When normalize is True, keys and
    string values are flattened to ASCII via clean_unicode; values that
    cannot be normalized (numbers, nested structures) are kept as-is.
    """
    import io  # local import keeps the notebook's top-level imports untouched
    with io.open(filename, encoding='latin-1') as f:
        for line in f:
            out = json.loads(line)
            if normalize:
                cleaned = {}
                for key, value in out.items():
                    try:
                        key = clean_unicode(key)
                    except Exception:
                        pass  # keep the original key on failure
                    try:
                        cleaned[key] = clean_unicode(value)
                    except Exception:
                        cleaned[key] = value  # non-string values pass through
                out = cleaned
            yield out
    # the original called f.close() here; the with-block already closed it

In [17]:
# only run once - parses all data and writes reduced data to file
start = time.time()
data_iter = parse_beer(RATEBEER_FILENAME, 'RB')
# Interpret the window boundaries as UTC midnights (calendar.timegm), which
# matches the UTC timestamps the sanitizers produce via time.gmtime.
start_timestamp = calendar.timegm(START_DATE.timetuple())
end_timestamp = calendar.timegm(END_DATE.timetuple())
filt_count, total_count = filter_data_date(start_timestamp, end_timestamp, data_iter, OUT_FILENAME)
# print(...) form runs on both Python 2 (single parenthesized argument) and
# Python 3; the bare `print ...` statement was Python-2-only syntax.
print('Finished writing parsed data to %s in %0.3fs' % (OUT_FILENAME, time.time() - start))


'review/appearance'
Finished writing parsed data to /Users/sean/Sync/cornell/CM/data-analysis-CM/data/reduced_data.txt in 465.860s

In [19]:
"""
Read the reviews data, and write a sorted copy.
"""
timestamp_keyfunc = lambda r: r.get('review/timeUnix')
reviews_iter = parse_json(OUT_FILENAME)

start = time.time()
with open(SORTED_FILENAME, 'w') as outfile:
    for review in sorted(reviews_iter, key=timestamp_keyfunc):
        review_date = date.fromtimestamp(review.get('review/timeUnix'))
        if review_date >= START_DATE and review_date < END_DATE:
            json.dump(review, outfile, ensure_ascii=True)
            outfile.write('\n')
print 'Finished writing date sorted data to %s in %0.3fs' % (SORTED_FILENAME, time.time() - start)
sorted_reviews = None


Finished writing date sorted data to /Users/sean/Sync/cornell/CM/data-analysis-CM/data/sorted_data.txt in 557.085s

In [ ]: