Beer Recommender


In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

# colorbrewer2 Dark2 qualitative color table (RGB triples in the 0-1 range)
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

# Notebook-wide matplotlib defaults.
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 400
# Newer matplotlib releases replaced the 'axes.color_cycle' key with
# 'axes.prop_cycle' (which takes a cycler); assigning the old key there raises
# KeyError. Prefer the new key and fall back to the legacy one when either the
# key or mpl.cycler is missing.
try:
    rcParams['axes.prop_cycle'] = mpl.cycler(color=dark2_colors)
except (KeyError, AttributeError):
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
# rcParams['font.family'] = 'StixGeneral'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Reduce chartjunk by hiding unwanted plot borders and their tick marks.

    axes is the axes to modify; when it is omitted (or falsy) the current
    pyplot axes is used. Each of top/right/left/bottom states whether the
    border on that side stays visible; ticks are re-enabled only for the
    sides that are kept.
    """
    ax = axes or plt.gca()

    # show or hide each spine according to its keyword
    for side, visible in (('top', top), ('right', right),
                          ('left', left), ('bottom', bottom)):
        ax.spines[side].set_visible(visible)

    # start from no ticks on either axis ...
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # ... then restore them per visible side, in the order top, bottom,
    # left, right (a later call on the same axis overrides an earlier one)
    for wanted, restore_ticks in ((top, ax.xaxis.tick_top),
                                  (bottom, ax.xaxis.tick_bottom),
                                  (left, ax.yaxis.tick_left),
                                  (right, ax.yaxis.tick_right)):
        if wanted:
            restore_ticks()
        
# widen pandas' console output: 500 characters per line, up to 100 columns
pd.set_option('display.width', 500,
              'display.max_columns', 100)

def autolabel(rects, height_offset, fontsize):
    """Label each bar in rects with its height, truncated to an integer.

    rects         -- iterable of bar patches (objects providing get_height(),
                     get_x() and get_width()), e.g. the return value of ax.bar
    height_offset -- vertical gap, in data units, between the top of a bar and
                     the bottom of its label
    fontsize      -- font size passed through to the text call

    The label is centred horizontally on the bar and written vertically.
    """
    for rect in rects:
        height = rect.get_height()
        # Draw on the axes that owns the bar instead of whichever axes pyplot
        # currently considers active; fall back to the current axes for a
        # patch that has no axes attached.
        target_axes = getattr(rect, 'axes', None) or plt.gca()
        target_axes.text(rect.get_x() + rect.get_width() / 2.0,
                         height + height_offset,
                         '%d' % int(height),
                         ha='center',
                         va='bottom',
                         rotation='vertical',
                         fontsize=fontsize)

In [2]:
# Read the review table from reviews_data.csv (a path relative to the working
# directory) into df. Later cells in this notebook read its 'text',
# 'username', 'beer_id' and 'brewery_id' columns.
df = pd.read_csv('reviews_data.csv')

In [3]:
# read in a subset of the data
# df2 = pd.read_csv('reviews_scrubbed/scrubbed_reviews_00.csv')

In [4]:
def get_reviews_by(df, column_name):
    """Count the rows of df for each distinct value of column_name.

    Returns a dict mapping each value found in df[column_name] to the number
    of rows carrying it (as a plain int). An empty frame yields {}.
    """
    # size() counts rows per group directly, without materialising the
    # per-group index arrays that .groups builds; .items() is available on
    # both Python 2 and Python 3.
    group_sizes = df.groupby(column_name).size()
    return {value: int(count) for value, count in group_sizes.items()}

# read in beer data
beer_df = pd.read_csv('beer_data.csv')

# rows with a non-null alias_id are aliases of other beers
is_alias = pd.notnull(beer_df['alias_id'])

# filter out aliases, and also beers with no ratings or reviews
reviewed_beers = beer_df[~is_alias & (beer_df['num_ratings'] > 0)]

unreviewed_beers = beer_df[~is_alias & (beer_df['num_ratings'] == 0)]

# get beers that are just aliases of other beers
aliases = beer_df[is_alias]

# get beer styles
styles = reviewed_beers['style'].unique()

# Single-argument print(...) calls behave identically under the Python 2 print
# statement and the Python 3 print function.
print('Total number of beers:              %s' % len(beer_df))
print('Number of rated/reviewed beers:     %s (%s%%)' % (len(reviewed_beers), float(len(reviewed_beers)) / len(beer_df) * 100.0))
print('Number of unrated/unreviewed beers: %s (%s%%)' % (len(unreviewed_beers), float(len(unreviewed_beers)) / len(beer_df) * 100.0))
print('Number of aliases:                  %s (%s%%)' % (len(aliases), float(len(aliases)) / len(beer_df) * 100.0))
print('')
print('Number of beer styles: %s' % len(styles))

# count the reviewed-beer rows for each style, and sort them, largest first
num_reviews_by_style = get_reviews_by(reviewed_beers, 'style')
sorted_num = sorted(num_reviews_by_style.items(), key=lambda pair: pair[1], reverse=True)

# construct x (evenly spaced coords), y (counts), and label (style name) arrays
# for plotting; the bars run from the smallest count to the largest
num_bars = len(sorted_num)
x = np.array(range(num_bars))
y = [count for _, count in reversed(sorted_num)]
# decode byte strings as UTF-8 (what unicode(style, 'utf_8') did on Python 2);
# values that are already text are used as they are
labels = [style.decode('utf_8') if isinstance(style, bytes) else style
          for style, _ in reversed(sorted_num)]

# plot count by style
fig = plt.figure(figsize=(20, 15))
fig.subplots_adjust(bottom=0.16)
ax = fig.add_subplot(1, 1, 1)
rects = ax.bar(x, y, align='center', width=.6)
ax.set_xlim([-1, num_bars])
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation='vertical', fontsize=8)
ax.set_xlabel('Style')
ax.set_ylabel('Reviews')
ax.set_title('Number of Reviews by Style')

# put count above each bar (offset of 40 data units, font size 7)
autolabel(rects, 40, 7)

# fig.savefig('reviews_by_style.png', dpi=1000)


Total number of beers:              93063
Number of rated/reviewed beers:     87191 (93.6902958211%)
Number of unrated/unreviewed beers: 5567 (5.98196920366%)
Number of aliases:                  305 (0.327734975232%)

Number of beer styles: 104

In [7]:
def display_stats_by(df, column_name, description, max_val_lookup_df, max_val_lookup_key):
    """Print reviews-per-item summary statistics and plot their histogram.

    Rows of df are counted per distinct value of column_name (via
    get_reviews_by); the mean, median, mode, min and max of those counts are
    printed, followed by a 25-bin histogram with a logarithmic count axis.

    df                 -- DataFrame with one row per review
    column_name        -- column whose values identify the items
    description        -- item label; lower-cased in the printed heading and
                          title-cased in the plot title
    max_val_lookup_df  -- optional DataFrame containing both column_name and
                          max_val_lookup_key, used to show a readable name for
                          the item with the most reviews; pass None to print
                          the raw column value instead
    max_val_lookup_key -- column of max_val_lookup_df holding the readable name

    Raises ValueError (from max) when df has no rows to count.
    """
    num_reviews_by_item = get_reviews_by(df, column_name)
    # list(...) so NumPy receives a real sequence even where dict.values()
    # is a view object rather than a list
    num_reviews = np.array(list(num_reviews_by_item.values()))

    # get the max number of reviews, and look up the human-readable name if required
    max_name, max_num = max(num_reviews_by_item.items(), key=lambda item: item[1])
    # Compare against None explicitly: the truth value of a DataFrame is
    # ambiguous and raises ValueError on current pandas.
    if max_val_lookup_df is not None:
        matches = max_val_lookup_df[max_val_lookup_df[column_name] == max_name]
        # keep the raw key when the lookup frame has no row for it, instead of
        # failing on .iloc[0]
        if len(matches) > 0:
            max_name = matches.iloc[0][max_val_lookup_key]

    # stats.mode yields a (mode, count) pair whose members are length-1 arrays
    # or plain scalars depending on the SciPy version; np.ravel handles both.
    mode_value, mode_count = [int(np.ravel(part)[0]) for part in stats.mode(num_reviews)]

    print('Reviews per %s stats:' % description.lower())
    print('    Mean:   %s' % num_reviews.mean())
    print('    Median: %s' % int(np.median(num_reviews)))
    print('    Mode:   %s (%s occurrences)' % (mode_value, mode_count))
    print('    Min:    %s' % num_reviews.min())
    print("    Max:    %s (%s)" % (max_num, max_name))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.hist(num_reviews, bins=25, log=True)
    ax.set_title('Number of Reviews per %s' % description.title())
    ax.set_ylabel('Occurrences')
    ax.set_xlabel('Number of Reviews')

def display_stats(df):
    """Print an overview of the review table df.

    Reports the total number of rows, how many have / lack a 'text' value,
    the number of distinct 'username', 'beer_id' and 'brewery_id' values, and
    then the per-user, per-beer and per-brewery breakdowns produced by
    display_stats_by. Beer and brewery names are looked up in the module-level
    reviewed_beers frame.

    Raises ZeroDivisionError when df has no rows.
    """
    total_num_reviews = len(df)
    # compute the null mask once; every row either has text or does not
    has_text = pd.notnull(df['text'])
    num_text_reviews = int(has_text.sum())
    num_nontext_reviews = total_num_reviews - num_text_reviews

    # Single-argument print(...) calls behave identically under the Python 2
    # print statement and the Python 3 print function.
    print('Total reviews:    %s' % total_num_reviews)
    print('Text reviews:     %s (%s%%)' % (num_text_reviews, float(num_text_reviews) / total_num_reviews * 100.0))
    print('Non-text reviews: %s (%s%%)' % (num_nontext_reviews, float(num_nontext_reviews) / total_num_reviews * 100.0))

    print('')
    print('Number of users:     %s' % len(df['username'].unique()))
    print('Number of beers:     %s' % len(df['beer_id'].unique()))
    print('Number of breweries: %s' % len(df['brewery_id'].unique()))

    print('')
    display_stats_by(df, 'username', 'User', None, None)
    print('')
    display_stats_by(df, 'beer_id', 'Beer', reviewed_beers, 'beer_name')
    print('')
    display_stats_by(df, 'brewery_id', 'Brewery', reviewed_beers, 'brewery_name')

display_stats(df)


Total reviews:    3701747
Text reviews:     1863353 (50.3371246063%)
Non-text reviews: 1838394 (49.6628753937%)

Number of users:     62782
Number of beers:     87100
Number of breweries: 6667

Reviews per user stats:
    Mean:   58.942514455
    Median: 6
    Mode:   1 (14581 occurrences)
    Min:    1
    Max:    7725 (t0rin0)

Reviews per beer stats:
    Mean:   42.4999655568
    Median: 4
    Mode:   1 (23230 occurrences)
    Min:    1
    Max:    9078 (90 Minute IPA)

Reviews per brewery stats:
    Mean:   555.234288286
    Median: 19
    Mode:   1 (684 occurrences)
    Min:    1
    Max:    93383 (Boston Beer Company (Samuel Adams))

In [8]:
# Spot-check: number of reviews for a few beers selected by exact name
beers = ['Harpoon IPA', 'Sierra Nevada Pale Ale', 'Flipside Red IPA', 'Harpoon Leviathan - Imperial IPA']

for b in beers:
    # use the first catalogue row carrying this name; .iloc[0] raises
    # IndexError if the name is not present in beer_df
    beer_id = beer_df[beer_df['beer_name'] == b].iloc[0]['beer_id']
    # one pre-formatted string prints the same on Python 2 and Python 3
    print('%s %s' % (b, len(df[df['beer_id'] == beer_id])))

# number of reviews recorded against the hard-coded beer id 788
print(len(df[df['beer_id'] == 788]))


Harpoon IPA 2622
Sierra Nevada Pale Ale 6549
Flipside Red IPA 1149
Harpoon Leviathan - Imperial IPA 1234
718

In [9]:
# Compare the beer ids in the reviewed-beer catalogue with the ids that
# actually occur in the review table.
reviewed_beers_ids = set(reviewed_beers['beer_id'].unique())
beer_ids = set(df['beer_id'].unique())

# Each message is formatted into one string so the output is the same under
# the Python 2 print statement and the Python 3 print function.
print('Num in beers dataset: %s' % len(reviewed_beers_ids))
print('Num in reviews dataset: %s' % len(beer_ids))

missed_beers = reviewed_beers_ids - beer_ids
print('In beers dataset, not in reviews dataset: %s' % len(missed_beers))

# ids that have reviews but no row in reviewed_beers; computed once and reused
unlisted_beers = beer_ids - reviewed_beers_ids
print('In reviews dataset, not in beers dataset: %s' % len(unlisted_beers))
print(unlisted_beers)

# print df[df['beer_id'] == 96996].iloc[0]


Num in beers dataset: 87191
Num in reviews dataset: 87100
In beers dataset, not in reviews dataset: 100
In reviews dataset, not in beers dataset: 9
set([96996, 96998, 92658, 92464, 92466, 103128, 103130, 97020, 95677])

In [ ]: