In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl
# colorbrewer2 Dark2 qualitative color table, as (red, green, blue) float triples
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

# notebook-wide matplotlib defaults
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 400
# 'axes.color_cycle' was deprecated in matplotlib 1.5 and removed in later
# releases in favour of 'axes.prop_cycle'. Prefer the new key and fall back to
# the old one on installations that predate matplotlib.cycler.
try:
    rcParams['axes.prop_cycle'] = mpl.cycler(color=dark2_colors)
except (AttributeError, KeyError):
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
# rcParams['font.family'] = 'StixGeneral'
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks.

    The top/right/left/bottom keywords toggle whether the corresponding plot
    border is drawn; ticks are only kept on the sides whose border is drawn.

    axes is the axes object to modify; when it is None the current pyplot
    axes (plt.gca()) is used.
    """
    ax = axes if axes is not None else plt.gca()

    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable ticks on the visible sides. tick_top()/tick_bottom() (and
    # tick_left()/tick_right()) each move the ticks to one side only, so calling
    # both in a row would leave just the last side ticked; when both sides of an
    # axis are requested, ask for 'both' explicitly instead.
    if top and bottom:
        ax.xaxis.set_ticks_position('both')
    elif top:
        ax.xaxis.tick_top()
    elif bottom:
        ax.xaxis.tick_bottom()

    if left and right:
        ax.yaxis.set_ticks_position('both')
    elif left:
        ax.yaxis.tick_left()
    elif right:
        ax.yaxis.tick_right()
# pandas display options: allow 500 characters per line and up to 100 columns
for _option_name, _option_value in (('display.width', 500),
                                    ('display.max_columns', 100)):
    pd.set_option(_option_name, _option_value)
def autolabel(rects, height_offset, fontsize):
    """Annotate every bar in rects with its height, truncated to an integer.

    rects: iterable of objects exposing get_x(), get_width() and get_height()
    height_offset: amount added to the bar height to position the label
    fontsize: font size passed through to plt.text
    """
    text_style = dict(ha='center', va='bottom', rotation='vertical', fontsize=fontsize)
    for bar in rects:
        bar_height = bar.get_height()
        # horizontally centre the label on the bar
        x_center = bar.get_x() + bar.get_width() / 2.0
        plt.text(x_center, bar_height + height_offset, '%d' % int(bar_height), **text_style)
In [2]:
# read in review data (path is relative to the notebook's working directory)
REVIEWS_CSV = 'reviews_data.csv'
df = pd.read_csv(REVIEWS_CSV)
In [3]:
# read in a subset of the data
# df2 = pd.read_csv('reviews_scrubbed/scrubbed_reviews_00.csv')
In [4]:
def get_reviews_by(df, column_name):
    """Count the rows of df for each distinct value of column_name.

    df: DataFrame to count rows in
    column_name: name of the column to group by

    Returns a dict mapping each value found in the column to the number of
    rows that carry it (plain Python ints). An empty frame gives {}.
    """
    # size() counts the rows per group directly; going through .groups would
    # first build the full index list of every group only to take its length
    counts = df.groupby(column_name).size()
    # tolist() converts keys and counts to native Python objects
    return dict(zip(counts.index.tolist(), counts.tolist()))
# read in beer data
beer_df = pd.read_csv('beer_data.csv')

# row masks: alias rows have alias_id set, every other row is a beer of its own
is_alias = pd.notnull(beer_df['alias_id'])
has_ratings = beer_df['num_ratings'] > 0
has_no_ratings = beer_df['num_ratings'] == 0

# filter out aliases, and also beers with no ratings or reviews
reviewed_beers = beer_df[~is_alias & has_ratings]
unreviewed_beers = beer_df[~is_alias & has_no_ratings]
# get beers that are just aliases of other beers
aliases = beer_df[is_alias]
# get beer styles
styles = reviewed_beers['style'].unique()

# summary counts, each with its share of all rows in beer_df
total_beers = len(beer_df)
print('Total number of beers: %s' % total_beers)
print('Number of rated/reviewed beers: %s (%s%%)'
      % (len(reviewed_beers), float(len(reviewed_beers)) / total_beers * 100.0))
print('Number of unrated/unreviewed beers: %s (%s%%)'
      % (len(unreviewed_beers), float(len(unreviewed_beers)) / total_beers * 100.0))
print('Number of aliases: %s (%s%%)'
      % (len(aliases), float(len(aliases)) / total_beers * 100.0))
print('')
print('Number of beer styles: %s' % len(styles))
# get the number of reviews for each style, and sort them (largest count first)
num_reviews_by_style = get_reviews_by(reviewed_beers, 'style')
sorted_num = sorted(num_reviews_by_style.items(), key=lambda x: x[1], reverse=True)
# construct x (evenly spaced coords), y (num reviews), and label (style name) arrays for plotting;
# y and labels walk sorted_num backwards, so the bars run from smallest to largest count
num_bars = len(sorted_num)
x = np.arange(num_bars)
y = [count for _, count in reversed(sorted_num)]
# decode byte-string style names as UTF-8 and pass already-decoded text through
# unchanged: unicode(style, 'utf_8') raised TypeError on text that was already
# unicode, and the unicode builtin does not exist on Python 3
labels = [style.decode('utf_8') if isinstance(style, bytes) else style
          for style, _ in reversed(sorted_num)]
# plot review count by style: one bar per style, labelled with the style name
fig, ax = plt.subplots(figsize=(20, 15))
# leave extra room at the bottom for the vertical style labels
fig.subplots_adjust(bottom=0.16)
rects = ax.bar(x, y, align='center', width=.6)

# one tick per bar, with a one-unit margin on either side of the bars
ax.set_xlim(-1, num_bars)
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation='vertical', fontsize=8)
ax.set(xlabel='Style', ylabel='Reviews', title='Number of Reviews by Style')

# put count above each bar
autolabel(rects, 40, 7)
# fig.savefig('reviews_by_style.png', dpi=1000)
In [7]:
def display_stats_by(df, column_name, description, max_val_lookup_df, max_val_lookup_key):
    """Print review-count statistics per distinct value of a column and plot their histogram.

    df: review DataFrame, passed to get_reviews_by
    column_name: column whose distinct values are the items being counted
    description: human-readable item name used in the printed header and plot title
    max_val_lookup_df: optional DataFrame used to translate the most-reviewed
        item's key into a readable name; pass None to print the raw key
    max_val_lookup_key: column of max_val_lookup_df that holds the readable name

    Prints mean, median, mode, min and max of the per-item review counts, then
    draws a log-scaled, 25-bin histogram of those counts on a new figure.
    """
    num_reviews_by_item = get_reviews_by(df, column_name)
    # list(...) keeps this a 1-d numeric array on Python 3 as well, where
    # dict.values() is a view that numpy would wrap as a single object
    num_reviews = np.array(list(num_reviews_by_item.values()))

    # get the max number of reviews, and look up the human-readable name if required
    max_name, max_num = max(num_reviews_by_item.items(), key=lambda x: x[1])
    # compare with None explicitly: the truth value of a DataFrame is ambiguous
    # and `if max_val_lookup_df:` raises ValueError
    if max_val_lookup_df is not None:
        matches = max_val_lookup_df[max_val_lookup_df[column_name] == max_name]
        # keep the raw key when the lookup table has no row for it
        if len(matches) > 0:
            max_name = matches.iloc[0][max_val_lookup_key]

    # stats.mode returns (mode, count) as arrays or scalars depending on the
    # SciPy version; atleast_1d handles both
    mode_result = stats.mode(num_reviews)
    mode_value = int(np.atleast_1d(mode_result[0])[0])
    mode_count = int(np.atleast_1d(mode_result[1])[0])

    print('Reviews per %s stats:' % description.lower())
    print(' Mean: %s' % num_reviews.mean())
    print(' Median: %s' % int(np.median(num_reviews)))
    print(' Mode: %s (%s occurrences)' % (mode_value, mode_count))
    print(' Min: %s' % num_reviews.min())
    print(' Max: %s (%s)' % (max_num, max_name))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.hist(num_reviews, bins=25, log=True)
    ax.set_title('Number of Reviews per %s' % description.title())
    ax.set_ylabel('Occurrences')
    ax.set_xlabel('Number of Reviews')
def display_stats(df):
    """Print overall review counts, then per-user, per-beer and per-brewery statistics.

    df: review DataFrame; this function reads its 'text', 'username',
        'beer_id' and 'brewery_id' columns.

    The per-beer and per-brewery sections take their readable names from the
    module-level reviewed_beers frame.
    """
    has_text = pd.notnull(df['text'])
    total_num_reviews = len(df)
    num_text_reviews = len(df[has_text])
    num_nontext_reviews = len(df[~has_text])

    print('Total reviews: %s' % total_num_reviews)
    print('Text reviews: %s (%s%%)'
          % (num_text_reviews, float(num_text_reviews) / total_num_reviews * 100.0))
    print('Non-text reviews: %s (%s%%)'
          % (num_nontext_reviews, float(num_nontext_reviews) / total_num_reviews * 100.0))
    print('')

    # distinct values per identifying column
    print('Number of users: %s' % len(df['username'].unique()))
    print('Number of beers: %s' % len(df['beer_id'].unique()))
    print('Number of breweries: %s' % len(df['brewery_id'].unique()))

    # each detailed section is preceded by a blank line
    print('')
    display_stats_by(df, 'username', 'User', None, None)
    print('')
    display_stats_by(df, 'beer_id', 'Beer', reviewed_beers, 'beer_name')
    print('')
    display_stats_by(df, 'brewery_id', 'Brewery', reviewed_beers, 'brewery_name')
# print the summary statistics and draw the per-user/beer/brewery histograms for the review data
display_stats(df)
In [8]:
# spot check: number of reviews in df for a few beers picked by name
beers = ['Harpoon IPA', 'Sierra Nevada Pale Ale', 'Flipside Red IPA', 'Harpoon Leviathan - Imperial IPA']
for b in beers:
    # id of the first beer_df row whose name matches exactly
    beer_id = beer_df.loc[beer_df['beer_name'] == b, 'beer_id'].iloc[0]
    print('%s %s' % (b, len(df[df['beer_id'] == beer_id])))
# the same count for one beer addressed directly by its id
print(len(df[df['beer_id'] == 788]))
In [9]:
# compare the beer ids listed in reviewed_beers with the beer ids that occur in the reviews
reviewed_beers_ids = set(reviewed_beers['beer_id'].unique())
beer_ids = set(df['beer_id'].unique())
# ids present on one side only
missed_beers = reviewed_beers_ids - beer_ids
unlisted_beers = beer_ids - reviewed_beers_ids

print('Num in beers dataset: %s' % len(reviewed_beers_ids))
print('Num in reviews dataset: %s' % len(beer_ids))
print('In beers dataset, not in reviews dataset: %s' % len(missed_beers))
print('In reviews dataset, not in beers dataset: %s' % len(unlisted_beers))
print(unlisted_beers)
# print df[df['beer_id'] == 96996].iloc[0]
In [ ]: