In [13]:
import os
import json
from ast import literal_eval
from datetime import datetime
# Directory holding the input data: a "data" folder directly under the
# current working directory of the kernel.
DATA_PATH = os.path.join(os.getcwd(), "data")

# File name of the review dump read from DATA_PATH.
SOURCE_FILENAME = "beer_50000.json"
In [14]:
# Load the review dump into ``beer_data``, one parsed record per input line.
# Records are parsed with ast.literal_eval (not json.loads), so each line is
# presumably a Python literal rather than strict JSON -- TODO confirm against
# the data file.
beer_data = []
with open(os.path.join(DATA_PATH, SOURCE_FILENAME)) as infile:
    # Stream the file instead of materialising every line with readlines().
    for line in infile:
        line = line.strip()
        if not line:
            # literal_eval raises SyntaxError on an empty string, so a blank
            # or trailing empty line would otherwise abort the whole load.
            continue
        beer_data.append(literal_eval(line))
In [15]:
def pretty_json(my_dict):
    """Return ``my_dict`` serialized as a human-readable JSON string.

    Keys are emitted in sorted order and nested levels are indented by four
    spaces. Any error raised by ``json.dumps`` propagates to the caller.
    """
    dump_options = {'sort_keys': True, 'indent': 4}
    return json.dumps(my_dict, **dump_options)
# Display the last loaded record, pretty-printed. The parenthesised
# single-argument form prints identically on Python 2 and runs on Python 3,
# where the bare print statement is a SyntaxError.
print(pretty_json(beer_data[-1]))
In [4]:
# Review count and first/last review dates for a single sample beer.
# The id is compared as a string -- assumes 'beer/beerId' is stored as a
# string in the records; TODO confirm against the data.
sample_beer_id = '20539'
results = [b for b in beer_data if b.get('beer/beerId') == sample_beer_id]
print('Total reviews for beer/beerId %s: %d' % (sample_beer_id, len(results)))

# Convert to int BEFORE taking min/max: if the timestamps are strings,
# comparing them directly is lexicographic and gives wrong answers when the
# values differ in digit count (e.g. '999999999' > '1000000000').
review_times = [int(t.get('review/timeUnix')) for t in results]
if review_times:
    min_time = min(review_times)
    max_time = max(review_times)
    # datetime.fromtimestamp renders in the local timezone of the machine.
    print('First review date for beer/beerId %s: %s' % (sample_beer_id, datetime.fromtimestamp(min_time)))
    print('Last review date for beer/beerId %s: %s' % (sample_beer_id, datetime.fromtimestamp(max_time)))
else:
    # min()/max() raise ValueError on an empty sequence; report the absence
    # of reviews instead of crashing the cell.
    min_time = max_time = None
    print('No reviews found for beer/beerId %s' % sample_beer_id)
In [5]:
def groupby_key(data, key_str):
    """Count how many records in ``data`` share each value of ``key_str``.

    Args:
        data: iterable of mapping-like records exposing ``.get``.
        key_str: the field whose values are tallied.

    Returns:
        dict mapping each observed field value to its number of occurrences.
        Records lacking ``key_str`` are tallied under ``None``, because the
        value is looked up with ``.get``.
    """
    counts = {}
    for record in data:
        value = record.get(key_str)
        if value in counts:
            counts[value] += 1
        else:
            counts[value] = 1
    return counts
In [6]:
# Dataset-wide summary: totals, unique beers/brewers/users, the most active
# reviewers, and the distribution of overall ratings.
# groupby_key(data, field) returns a dict of {field value: record count}.
# All prints use the parenthesised single-argument form so the cell runs
# unchanged on both Python 2 and Python 3.
total_reviews = len(beer_data)
print('Total reviews:\t%s' % "{:,}".format(total_reviews))

beers_grouped = groupby_key(beer_data, 'beer/beerId')
print('Unique beers:\t%s' % "{:,}".format(len(beers_grouped)))

brewers_grouped = groupby_key(beer_data, 'beer/brewerId')
print('Unique brewers:\t%s' % "{:,}".format(len(brewers_grouped)))
print('')

users_grouped = groupby_key(beer_data, 'user/profileName')
print('Unique users:\t%s' % "{:,}".format(len(users_grouped)))

print('Top 10 reviewers')
# key= replaces the cmp= argument, which no longer exists on Python 3.
# The sort is stable, so users with equal counts keep the same relative order.
sorted_users = sorted(users_grouped.items(), key=lambda user: user[1], reverse=True)
# Slicing (rather than indexing range(10)) avoids an IndexError when the
# dataset has fewer than ten distinct users.
for rank, (profile_name, review_count) in enumerate(sorted_users[:10], 1):
    print('\t#%2d: %-20s%d' % (rank, profile_name, review_count))

one_review_user_count = sum(1 for u in sorted_users if u[1] == 1)
# Guard the percentage against an empty user table (division by zero).
if users_grouped:
    one_review_user_pct = float(one_review_user_count) / len(users_grouped) * 100
else:
    one_review_user_pct = 0.0
print('1 review users:\t%s\t%0.2f%%' % ("{:,}".format(one_review_user_count), one_review_user_pct))
print('')

if total_reviews:
    rating_sum = sum(float(r.get('review/overall')) for r in beer_data)
    print('Avg. rating:\t%0.2f' % (rating_sum / total_reviews))
else:
    # No reviews loaded: an average is undefined, so avoid dividing by zero.
    print('Avg. rating:\tn/a')

print('Rating distribution:')
reviews_grouped = groupby_key(beer_data, 'review/overall')
# An empty dataset yields an empty dict, so the division below never runs
# with total_reviews == 0.
for score in sorted(reviews_grouped):
    count = reviews_grouped[score]
    print('\t%s - %-8s %0.2f%%' % (score, "{:,}".format(count), float(count) / total_reviews * 100))
In [ ]: