In [1]:
import pandas as pd
import numpy as np
import cPickle as pickle
import matplotlib.pyplot as plt
% matplotlib inline
In [2]:
# Load the pickled dataset from disk.
# Pickle data is binary, so the file is opened in 'rb' mode; text mode can
# corrupt the stream on platforms that translate line endings.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
filepath = '/Users/Gevurtz/galvanize/beer_rec_project/data/aggregated.pkl'
with open(filepath, 'rb') as f:
    data = pickle.load(f)
Convert the JSON records to a pandas DataFrame
In [3]:
# Flatten the loaded records into a tabular DataFrame.
df = pd.io.json.json_normalize(data)
In [4]:
# Drop the reference to the raw records now that they live in `df`.
data = None
In [5]:
# Remove retired beers: keep only rows whose brewery name is not 'none'.
df = df.loc[df['breweryname'] != 'none']
In [6]:
# Summarise the frame's columns, dtypes and non-null counts.
df.info()
In [6]:
# Preview the first rows.
df.head()
Out[6]:
In [7]:
def print_summary(df):
    """Print headline counts for a review DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per review; must contain the columns ``userid``, ``beerid``,
        ``breweryname`` and ``style``.

    Prints the number of distinct users, the number of reviews (rows), and
    the number of distinct beers, breweries and styles. Returns None.
    """
    # Columns are read with brackets, not attributes: `df.style` is the pandas
    # Styler accessor on current pandas versions, not the 'style' column.
    counts = [
        ('users: ', len(df['userid'].unique())),
        ('reviews: ', len(df)),
        ('beers: ', len(df['beerid'].unique())),
        ('breweries:', len(df['breweryname'].unique())),
        ('styles: ', len(df['style'].unique())),
    ]
    for label, value in counts:
        # Single-argument print gives identical output on Python 2 and 3.
        print('%s %s' % (label, value))
In [8]:
# Print the dataset counts for `df`.
print_summary(df)
In [ ]:
# Do usernames identify users uniquely? Count the distinct usernames.
len(df['username'].unique())
In [7]:
import graphlab as gl
In [7]:
# List the available column labels.
df.columns
Out[7]:
In [ ]:
# Coerce the user keys: integer user ids and plain-string usernames.
df['userid'] = df['userid'].map(int)
df['username'] = df['username'].map(str)
#df['brewery_name'] = df['brewery_name'].map(lambda x: str(x))
In [9]:
# Columns carried over from the DataFrame into the GraphLab SFrame.
sframe_columns = [
    u'beerid',
    u'beername',
    u'breweryname',
    u'overall',
    u'userid',
    u'username',
    u'style',
]
SFdata = gl.SFrame(df[sframe_columns])
In [10]:
# Hold out a test set with GraphLab's per-user splitter, keyed on
# userid/beerid and using at most 10000 users for the test side.
train, test = gl.recommender.util.random_split_by_user(SFdata,
user_id='userid',
item_id='beerid',
max_num_users=10000)
In [15]:
# Baseline: popularity recommender fitted on the training split
# (user key 'username', item key 'beerid', target 'overall').
pop_model = gl.popularity_recommender.create(train, 'username', 'beerid', 'overall')
In [18]:
baseline_rmse = gl.evaluation.rmse(test['overall'], pop_model.predict(test))
print 'Baseline RMSE: ',baseline_rmse
In [16]:
# Observation data for the factorization models: user/item keys, the
# `overall` target, and the per-review sub-scores as extra columns.
obs_columns = ['appearance', 'aroma', 'beerid', 'overall', 'palate', 'taste', 'username']
# Request only columns the split actually carries. The SFrame built earlier in
# this notebook does not include the sub-score columns, and selecting a missing
# column fails on a fresh kernel.
# NOTE(review): if the sub-scores are wanted as features, add them where the
# SFrame is constructed.
present_columns = set(train.column_names())
obs_columns = [name for name in obs_columns if name in present_columns]
obs_data_train = train[obs_columns]
obs_data_test = test[obs_columns]
In [22]:
# Display the held-out observation SFrame.
obs_data_test
Out[22]:
In [ ]:
In [17]:
# Item-level side columns (style and brewery), keyed by beer id, for each split.
item_columns = ('beerid', 'style', 'breweryname')
product_data_train = train[item_columns]
product_data_test = test[item_columns]
In [18]:
# Regularization sweep: fit one factorization recommender per strength.
regularization_vals = [.01, .001, .0001]
models = []
for reg in regularization_vals:
    fitted = gl.factorization_recommender.create(obs_data_train,
                                                 user_id="username",
                                                 item_id="beerid",
                                                 target="overall",
                                                 item_data=product_data_train,
                                                 max_iterations=50,
                                                 num_factors=5,
                                                 regularization=reg)
    models.append(fitted)
In [23]:
# Gather training RMSE and held-out RMSE for every model, in sweep order.
rmse_train = [fitted['training_rmse'] for fitted in models]
rmse_test = []
for fitted in models:
    held_out_pred = fitted.predict(obs_data_test, new_item_data=product_data_test)
    rmse_test.append(gl.evaluation.rmse(obs_data_test['overall'], held_out_pred))
In [24]:
# Target/user/item projection of the held-out split.
new_test = test['overall', 'username', 'beerid']
# Recompute training and held-out RMSE for each model in the sweep.
rmse_train, rmse_test = [], []
for fitted in models:
    rmse_train.append(fitted['training_rmse'])
    predictions = fitted.predict(obs_data_test, new_item_data=product_data_test)
    score = gl.evaluation.rmse(obs_data_test['overall'], predictions)
    rmse_test.append(score)
In [25]:
# Train vs. test RMSE across the sweep on a log-scaled x axis, with the
# popularity baseline drawn as a flat reference line.
baseline_curve = [baseline_rmse] * len(regularization_vals)
fig, ax = plt.subplots(figsize=(10, 8))
train_line, test_line, baseline_line = ax.semilogx(
    regularization_vals, rmse_train,
    regularization_vals, rmse_test,
    regularization_vals, baseline_curve
)
#ax.set_ylim([0.45, .7])
ax.set_xlabel('Regularization', fontsize=20)
ax.set_ylabel('RMSE', fontsize=20)
ax.legend([train_line, test_line, baseline_line], ["Train", "Test", "Baseline"])
plt.show()
In [11]:
# Plain observations only: beer id, overall rating and username.
rating_columns = ('beerid', 'overall', 'username')
obs_data_train = train[rating_columns]
obs_data_test = test[rating_columns]
In [12]:
# Style and brewery per beer id, taken from each split.
product_data_train, product_data_test = [
    split['beerid', 'style', 'breweryname'] for split in (train, test)
]
In [ ]:
# Left disabled: `create()` was called here with no arguments, but it needs at
# least the observation data, so running this cell raised a TypeError and
# broke Run All. The ranking model is fitted with full arguments further down.
# gl.ranking_factorization_recommender.create()
In [28]:
# Second sweep over smaller regularization strengths, with the same model settings.
regularization_vals = [.001, .0001, .00001]
shared_options = dict(user_id="username",
                      item_id="beerid",
                      target="overall",
                      item_data=product_data_train,
                      max_iterations=50,
                      num_factors=5)
new_models = []
for strength in regularization_vals:
    new_models.append(gl.factorization_recommender.create(obs_data_train,
                                                          regularization=strength,
                                                          **shared_options))
In [ ]:
# Training and held-out RMSE for each model of the second sweep.
rmse_train = []
rmse_test = []
for candidate in new_models:
    rmse_train.append(candidate['training_rmse'])
    predicted = candidate.predict(obs_data_test, new_item_data=product_data_test)
    rmse_test.append(gl.evaluation.rmse(obs_data_test['overall'], predicted))
In [ ]:
# Plot train/test RMSE against regularization (log x axis) next to the flat baseline.
fig, ax = plt.subplots(figsize=(10, 8))
flat_baseline = len(regularization_vals) * [baseline_rmse]
curves = ax.semilogx(regularization_vals, rmse_train,
                     regularization_vals, rmse_test,
                     regularization_vals, flat_baseline)
p1, p2, p3 = curves
#ax.set_ylim([0.45, .7])
ax.set_xlabel('Regularization', fontsize=20)
ax.set_ylabel('RMSE', fontsize=20)
ax.legend([p1, p2, p3], ["Train", "Test", "Baseline"])
plt.show()
In [21]:
# Observations keyed by numeric user id: beer id, overall rating, user id.
rating_fields = ('beerid', 'overall', 'userid')
obs_data_train, obs_data_test = (split[rating_fields] for split in (train, test))
In [22]:
# Per-beer side columns (style, brewery) for the train and test splits.
side_fields = ('beerid', 'style', 'breweryname')
product_data_train = train[side_fields]
product_data_test = test[side_fields]
In [21]:
# Fit a ranking factorization recommender on the training observations
# (userid / beerid / overall), passing `product_data_train` as item side data.
model = gl.recommender.ranking_factorization_recommender.create(obs_data_train,
user_id='userid',
item_id='beerid',
target='overall',
item_data=product_data_train)
In [32]:
# Recommendations from the ranking model for user id 409311.
model.recommend(users=[409311])
Out[32]:
In [35]:
# Look up the rows for beer id 7286.
SFdata[SFdata['beerid'] == 7286]
Out[35]:
In [43]:
# Items the ranking model considers most similar to beers 7286 and 24542.
model.get_similar_items(items=[7286, 24542])
Out[43]:
In [42]:
# Rows recorded under the username 'monty_pilsner'.
SFdata[SFdata['username'] == 'monty_pilsner']
Out[42]:
In [44]:
# Save the ranking model into the web app directory.
# NOTE(review): absolute, machine-specific path.
model.save('/Users/Gevurtz/galvanize/beer_rec_project/beer-recommender/webapp/rankmodel')
In [45]:
# Reload the saved ranking model from disk into `l_model`.
l_model = gl.load_model('/Users/Gevurtz/galvanize/beer_rec_project/beer-recommender/webapp/rankmodel')
In [29]:
# Item-similarity recommenders trained on `obs_data_train`.
# `similarity_model` (library-default similarity settings) is created here
# because the cells below query and save it; without this definition they fail
# with NameError on a fresh kernel.
# NOTE(review): assumes the original model used default options -- confirm.
similarity_model = gl.recommender.item_similarity_recommender.create(obs_data_train,
                                                                      user_id='userid',
                                                                      item_id='beerid',
                                                                      target='overall')
# Cosine-similarity variant, kept under its own name for comparison.
similarity_model_cos = gl.recommender.item_similarity_recommender.create(obs_data_train,
                                                                          user_id='userid',
                                                                          item_id='beerid',
                                                                          target='overall',
                                                                          similarity_type='cosine')
In [25]:
# Items most similar to beer 24542 according to `similarity_model`.
similarity_model.get_similar_items([24542])
Out[25]:
In [26]:
# Show the option values `similarity_model` is currently configured with.
similarity_model.get_current_options()
Out[26]:
In [31]:
# Items most similar to beer 24542 according to the cosine-similarity model.
similarity_model_cos.get_similar_items([24542])
Out[31]:
In [ ]:
# Save the item-similarity model into the web app directory.
# NOTE(review): absolute, machine-specific path.
similarity_model.save('/Users/Gevurtz/galvanize/beer_rec_project/beer-recommender/webapp/similarity')