In [1]:
import pandas as pd
import numpy as np
import cPickle as pickle
import matplotlib.pyplot as plt
% matplotlib inline

Loading data


In [2]:
# NOTE(review): hardcoded absolute local path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
filepath = '/Users/Gevurtz/galvanize/beer_rec_project/data/aggregated.pkl'
# Pickle files are binary: open in 'rb'. Text mode 'r' can corrupt or
# fail to read protocol >= 1 pickles (and breaks on Python 3 entirely).
with open(filepath, 'rb') as f:
    data = pickle.load(f)

Converting the JSON records to a pandas DataFrame


In [3]:
df = pd.io.json.json_normalize(data)

In [4]:
# Drop the reference to the raw JSON records so their memory can be
# reclaimed — only the DataFrame copy is needed from here on.
data = None

In [5]:
# Drop retired beers: rows whose brewery name is the placeholder 'none'.
keep_mask = df['breweryname'] != 'none'
df = df[keep_mask]

Dataframe info


In [6]:
# Column dtypes and memory footprint of the review table.
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2520704 entries, 0 to 2807668
Data columns (total 12 columns):
appearance     int64
aroma          int64
beerid         int64
beername       object
breweryname    object
overall        int64
palate         int64
style          object
taste          int64
text           object
userid         object
username       object
dtypes: int64(6), object(6)
memory usage: 250.0+ MB

In [6]:
# Preview the first five reviews.
df.head()


Out[6]:
appearance aroma beerid beername breweryname overall palate style taste text userid username
0 3 6 256594 Singin’ River Handy’s Gold Singin’ River Brewing Company 13 4 Golden Ale/Blond Ale 6 pours a medium golden yellowish color aromas o... 28586 oldrtybastrd
1 4 6 256594 Singin’ River Handy’s Gold Singin’ River Brewing Company 12 3 Golden Ale/Blond Ale 6 can at home pours clear gold with a large pill... 9357 BeerandBlues2
2 4 6 256594 Singin’ River Handy’s Gold Singin’ River Brewing Company 9 4 Golden Ale/Blond Ale 6 can slightly hazed goldsmall puffy white head ... 151324 Patrickctenchi
3 5 7 256594 Singin’ River Handy’s Gold Singin’ River Brewing Company 10 2 Golden Ale/Blond Ale 5 can orange peach biscuit malt and straw aroma ... 10924 shrubber85
4 4 7 256594 Singin’ River Handy’s Gold Singin’ River Brewing Company 15 4 Golden Ale/Blond Ale 7 poured from a 12 oz can into a pint glass the ... 327076 ordybill

Summary statistics


In [7]:
def print_summary(df):
    n_beers = len(df.beerid.unique())
    n_breweries = len(df.breweryname.unique())
    n_styles = len(df.style.unique())
    n_users = len(df.userid.unique())
    n_reviews = len(df)

    print 'users:    ',n_users
    print 'reviews:  ',n_reviews
    print 'beers:    ', n_beers
    print 'breweries:', n_breweries
    print 'styles:   ', n_styles

In [8]:
# Corpus-level counts after dropping retired beers.
print_summary(df)


users:     40579
reviews:   2520704
beers:     28588
breweries: 2491
styles:    79

In [ ]:
# are usernames a unique identifier?
# NOTE(review): compare against len(df.userid.unique()) (40579 above)
# before keying models on 'username' instead of 'userid' — the answer
# is not recorded in this notebook.
len(df.username.unique())

GraphLab modeling


In [7]:
import graphlab as gl

In [7]:
# Columns available for the recommender.
df.columns


Out[7]:
Index([u'appearance', u'aroma', u'beerid', u'beername', u'breweryname',
       u'overall', u'palate', u'style', u'taste', u'text', u'userid',
       u'username'],
      dtype='object')

In [ ]:
# Normalize id/name dtypes before handing the frame to GraphLab.
# Vectorized .astype replaces the per-element map(lambda ...) calls:
# same result on valid values, less Python-level overhead per row.
df['userid'] = df['userid'].astype(int)
df['username'] = df['username'].astype(str)

In [9]:
# Build the GraphLab SFrame from only the columns the recommenders use;
# the free-text review body is dropped here to save memory.
SFdata = gl.SFrame(df[[u'beerid',
                       u'beername',
                       u'breweryname',
                       u'overall',
                       u'userid',
                       u'username',
                       u'style']])


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1478028604.log
INFO:graphlab.cython.cy_server:GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1478028604.log
This non-commercial license of GraphLab Create for academic use is assigned to gevurtz+galavize@gmail.com and will expire on October 17, 2017.

In [10]:
# Hold out part of the ratings of up to 10,000 users for evaluation;
# the remaining observations form the training set.
# NOTE(review): the split is keyed on 'userid' but the models below are
# keyed on 'username' — confirm the two identify the same users.
train, test = gl.recommender.util.random_split_by_user(SFdata,
                                                       user_id='userid',
                                                       item_id='beerid',
                                                       max_num_users=10000)

In [15]:
# Baseline: recommend by global item popularity, predicting each beer's
# aggregate 'overall' score regardless of the user.
pop_model = gl.popularity_recommender.create(train, 'username', 'beerid', 'overall')


Recsys training: model = popularity
Warning: Ignoring columns beername, breweryname, userid, style;
    To use these columns in scoring predictions, use a model that allows the use of additional features.
Preparing data set.
    Data has 2401680 observations with 39805 users and 28588 items.
    Data prepared in: 3.0926s
2401680 observations to process; with 28588 unique items.

In [18]:
# RMSE of the popularity model on the held-out set — the number every
# factorization model below must beat.
baseline_rmse = gl.evaluation.rmse(test['overall'], pop_model.predict(test))
# Parenthesized single-string form prints identically under Python 2
# and Python 3; spacing matches the original two-space output.
print('Baseline RMSE:  %s' % baseline_rmse)


Baseline RMSE:  2.2379642947

Matrix factorization model with side features and sub-ratings


In [16]:
# Observation data keeping the sub-ratings (appearance/aroma/palate/
# taste) as side features alongside the 'overall' target.
obs_data_train = train['appearance',
                       'aroma',
                       'beerid',
                       'overall',
                       'palate',
                       'taste',
                      'username']
obs_data_test = test['appearance',
                       'aroma',
                       'beerid',
                       'overall',
                       'palate',
                       'taste',
                     'username']

In [22]:
obs_data_test


Out[22]:
appearance aroma beerid overall palate taste username
4 6 256594 13 3 6 Itzjerm
5 5 256594 15 4 7 Aggiebob
3 6 256598 13 3 7 LiMuBai
4 5 256598 13 4 6 Aggiebob
5 6 403526 14 5 6 Patrickctenchi
5 7 403526 13 5 6 Aggiebob
3 6 403526 12 3 6 sonnycheeba
3 6 375884 14 3 6 crabbypantalones
3 6 375884 12 3 6 kerenmk
4 7 122302 14 3 7 TXBadger
[504574 rows x 7 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [ ]:


In [17]:
# Item-side features (beer style and brewery), joined to observations
# on 'beerid' by the recommender.
product_data_train = train['beerid', 'style','breweryname']
product_data_test = test['beerid', 'style','breweryname']

In [18]:
# Grid over the L2 factor-regularization strength.
regularization_vals = [.01, .001, .0001]

# One 5-factor factorization model per regularization value, each using
# the style/brewery item side data.
# NOTE(review): per the training log below, the 0.0001 run repeatedly
# DIVERGED during step-size tuning — its RMSE is not comparable.
models = [gl.factorization_recommender.create(obs_data_train,
                                user_id="username",
                                item_id="beerid",
                                target="overall",
                                item_data=product_data_train,
                                max_iterations=50,
                                num_factors=5,
                                regularization=r)
                                              for r in regularization_vals]


Recsys training: model = factorization_recommender
Preparing data set.
    Data has 2016130 observations with 37440 users and 28588 items.
    Data prepared in: 6.04583s
Training factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter                      | Description                                      | Value    |
+--------------------------------+--------------------------------------------------+----------+
| num_factors                    | Factor Dimension                                 | 5        |
| regularization                 | L2 Regularization on Factors                     | 0.01     |
| solver                         | Solver used for training                         | adagrad  |
| linear_regularization          | L2 Regularization on Linear Coefficients         | 1e-10    |
| side_data_factorization        | Assign Factors for Side Data                     | True     |
| max_iterations                 | Maximum Number of Iterations                     | 50       |
+--------------------------------+--------------------------------------------------+----------+
  Optimizing model using SGD; tuning step size.
  Using 252016 / 2016130 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value                |
+---------+-------------------+------------------------------------------+
| 0       | 0.00365459        | 2.35444                                  |
| 1       | 0.0018273         | 3.14429                                  |
| 2       | 0.000913649       | No Decrease (24.4922 >= 9.69869)         |
| 3       | 0.000228412       | No Decrease (11.5737 >= 9.69869)         |
+---------+-------------------+------------------------------------------+
| Final   | 0.00365459        | 2.35444                                  |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter.   | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size   |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 146us        | 9.70401           | 3.11512               |             |
+---------+--------------+-------------------+-----------------------+-------------+
| 1       | 2.06s        | 2.66703           | 1.6325                | 0.00365459  |
| 2       | 4.05s        | 2.23601           | 1.49533               | 0.00365459  |
| 3       | 6.01s        | 2.16851           | 1.47259               | 0.00365459  |
| 4       | 8.01s        | 2.12641           | 1.45822               | 0.00365459  |
| 5       | 10.01s       | 2.09503           | 1.44742               | 0.00365459  |
| 6       | 12.02s       | 2.07055           | 1.43894               | 0.00365459  |
| 7       | 13.98s       | 2.05037           | 1.43191               | 0.00365459  |
| 8       | 15.96s       | 2.03276           | 1.42575               | 0.00365459  |
| 9       | 18.37s       | 2.01717           | 1.42027               | 0.00365459  |
| 10      | 21.01s       | 2.00315           | 1.41533               | 0.00365459  |
| 11      | 23.61s       | 1.99058           | 1.41088               | 0.00365459  |
| 12      | 27.14s       | 1.979             | 1.40677               | 0.00365459  |
| 13      | 29.64s       | 1.96844           | 1.40301               | 0.00365459  |
| 14      | 32.09s       | 1.95853           | 1.39948               | 0.00365459  |
| 15      | 34.16s       | 1.94949           | 1.39624               | 0.00365459  |
| 16      | 36.19s       | 1.94099           | 1.39319               | 0.00365459  |
| 17      | 38.18s       | 1.93284           | 1.39027               | 0.00365459  |
| 18      | 40.18s       | 1.92561           | 1.38766               | 0.00365459  |
| 19      | 42.18s       | 1.91855           | 1.38512               | 0.00365459  |
| 20      | 44.16s       | 1.91176           | 1.38266               | 0.00365459  |
| 21      | 46.19s       | 1.90564           | 1.38045               | 0.00365459  |
| 22      | 48.21s       | 1.89965           | 1.37828               | 0.00365459  |
| 23      | 50.18s       | 1.89391           | 1.37619               | 0.00365459  |
| 24      | 52.18s       | 1.88857           | 1.37425               | 0.00365459  |
| 25      | 54.18s       | 1.88337           | 1.37236               | 0.00365459  |
| 26      | 56.17s       | 1.87848           | 1.37058               | 0.00365459  |
| 27      | 58.16s       | 1.8739            | 1.36891               | 0.00365459  |
| 28      | 1m 0s        | 1.86932           | 1.36723               | 0.00365459  |
| 29      | 1m 2s        | 1.86516           | 1.36571               | 0.00365459  |
| 30      | 1m 4s        | 1.86103           | 1.3642                | 0.00365459  |
| 31      | 1m 6s        | 1.85707           | 1.36274               | 0.00365459  |
| 32      | 1m 8s        | 1.85329           | 1.36136               | 0.00365459  |
| 33      | 1m 11s       | 1.84946           | 1.35995               | 0.00365459  |
| 34      | 1m 14s       | 1.84612           | 1.35872               | 0.00365459  |
| 35      | 1m 16s       | 1.84279           | 1.35749               | 0.00365459  |
| 36      | 1m 19s       | 1.83945           | 1.35626               | 0.00365459  |
| 37      | 1m 22s       | 1.83631           | 1.3551                | 0.00365459  |
| 38      | 1m 24s       | 1.83335           | 1.35401               | 0.00365459  |
| 39      | 1m 25s       | 1.83036           | 1.35291               | 0.00365459  |
| 40      | 1m 27s       | 1.82763           | 1.3519                | 0.00365459  |
| 41      | 1m 29s       | 1.82488           | 1.35088               | 0.00365459  |
| 42      | 1m 31s       | 1.82224           | 1.3499                | 0.00365459  |
| 43      | 1m 33s       | 1.81959           | 1.34892               | 0.00365459  |
| 44      | 1m 35s       | 1.81715           | 1.34802               | 0.00365459  |
| 45      | 1m 37s       | 1.81476           | 1.34713               | 0.00365459  |
| 46      | 1m 40s       | 1.81241           | 1.34626               | 0.00365459  |
| 47      | 1m 42s       | 1.81018           | 1.34543               | 0.00365459  |
| 48      | 1m 45s       | 1.80799           | 1.34461               | 0.00365459  |
| 49      | 1m 48s       | 1.8057            | 1.34376               | 0.00365459  |
| 50      | 1m 51s       | 1.80376           | 1.34304               | 0.00365459  |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached.
Computing final objective value and training RMSE.
       Final objective value: 1.8027
       Final training RMSE: 1.34265
Recsys training: model = factorization_recommender
Preparing data set.
    Data has 2016130 observations with 37440 users and 28588 items.
    Data prepared in: 6.32115s
Training factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter                      | Description                                      | Value    |
+--------------------------------+--------------------------------------------------+----------+
| num_factors                    | Factor Dimension                                 | 5        |
| regularization                 | L2 Regularization on Factors                     | 0.001    |
| solver                         | Solver used for training                         | adagrad  |
| linear_regularization          | L2 Regularization on Linear Coefficients         | 1e-10    |
| side_data_factorization        | Assign Factors for Side Data                     | True     |
| max_iterations                 | Maximum Number of Iterations                     | 50       |
+--------------------------------+--------------------------------------------------+----------+
  Optimizing model using SGD; tuning step size.
  Using 252016 / 2016130 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value                |
+---------+-------------------+------------------------------------------+
| 0       | 0.0365459         | Not Viable                               |
| 1       | 0.00913649        | 2.52055                                  |
| 2       | 0.00456824        | Not Viable                               |
| 3       | 0.00114206        | Not Viable                               |
| 4       | 0.000285515       | No Decrease (19.1178 >= 9.75602)         |
+---------+-------------------+------------------------------------------+
| Final   | 0.00913649        | 2.52055                                  |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter.   | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size   |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 123us        | 9.70368           | 3.11507               |             |
+---------+--------------+-------------------+-----------------------+-------------+
| 1       | 2.55s        | 6.43908           | 2.53671               | 0.00913649  |
| 2       | 5.26s        | 2.09953           | 1.4477                | 0.00913649  |
| 3       | 7.32s        | 2.0086            | 1.4161                | 0.00913649  |
| 4       | 9.31s        | 1.95714           | 1.39794               | 0.00913649  |
| 5       | 11.31s       | 1.92265           | 1.38566               | 0.00913649  |
| 6       | 13.30s       | 1.89655           | 1.3763                | 0.00913649  |
| 7       | 15.29s       | 1.87682           | 1.36919               | 0.00913649  |
| 8       | 17.29s       | 1.86039           | 1.36323               | 0.00913649  |
| 9       | 19.27s       | 1.84624           | 1.35808               | 0.00913649  |
| 10      | 21.28s       | 1.83413           | 1.35366               | 0.00913649  |
| 11      | 23.30s       | 1.82334           | 1.34969               | 0.00913649  |
| 12      | 25.29s       | 1.81383           | 1.34619               | 0.00913649  |
| 13      | 27.30s       | 1.80513           | 1.34298               | 0.00913649  |
| 14      | 29.28s       | 1.79752           | 1.34015               | 0.00913649  |
| 15      | 31.29s       | 1.79038           | 1.3375                | 0.00913649  |
| 16      | 33.26s       | 1.78399           | 1.33511               | 0.00913649  |
| 17      | 35.26s       | 1.77794           | 1.33285               | 0.00913649  |
| 18      | 37.26s       | 1.77225           | 1.33072               | 0.00913649  |
| 19      | 39.20s       | 1.76759           | 1.32897               | 0.00913649  |
| 20      | 41.23s       | 1.7627            | 1.32714               | 0.00913649  |
| 21      | 43.23s       | 1.75826           | 1.32546               | 0.00913649  |
| 22      | 45.25s       | 1.75437           | 1.32399               | 0.00913649  |
| 23      | 47.23s       | 1.75057           | 1.32254               | 0.00913649  |
| 24      | 49.73s       | 1.74676           | 1.32111               | 0.00913649  |
| 25      | 52.28s       | 1.74347           | 1.31985               | 0.00913649  |
| 26      | 54.32s       | 1.74015           | 1.31859               | 0.00913649  |
| 27      | 56.36s       | 1.73681           | 1.31733               | 0.00913649  |
| 28      | 58.37s       | 1.73413           | 1.31631               | 0.00913649  |
| 29      | 1m 0s        | 1.73112           | 1.31517               | 0.00913649  |
| 30      | 1m 2s        | 1.72874           | 1.31427               | 0.00913649  |
| 31      | 1m 4s        | 1.72619           | 1.31331               | 0.00913649  |
| 32      | 1m 7s        | 1.72366           | 1.31235               | 0.00913649  |
| 33      | 1m 10s       | 1.72147           | 1.31154               | 0.00913649  |
| 34      | 1m 13s       | 1.71855           | 1.31044               | 0.00913649  |
| 35      | 1m 15s       | 1.7169            | 1.30981               | 0.00913649  |
| 36      | 1m 17s       | 1.71533           | 1.30926               | 0.00913649  |
| 37      | 1m 21s       | 1.71313           | 1.30845               | 0.00913649  |
| 38      | 1m 24s       | 1.71123           | 1.30775               | 0.00913649  |
| 39      | 1m 26s       | 1.70973           | 1.30721               | 0.00913649  |
| 40      | 1m 29s       | 1.70835           | 1.30673               | 0.00913649  |
| 41      | 1m 31s       | 1.70675           | 1.30617               | 0.00913649  |
| 42      | 1m 34s       | 1.70571           | 1.30582               | 0.00913649  |
| 43      | 1m 36s       | 1.70426           | 1.30531               | 0.00913649  |
| 44      | 1m 39s       | 1.70327           | 1.30498               | 0.00913649  |
| 45      | 1m 41s       | 1.70191           | 1.30448               | 0.00913649  |
| 46      | 1m 43s       | 1.70095           | 1.30412               | 0.00913649  |
| 47      | 1m 45s       | 1.69989           | 1.30369               | 0.00913649  |
| 48      | 1m 47s       | 1.69858           | 1.30312               | 0.00913649  |
| 49      | 1m 49s       | 1.69663           | 1.30228               | 0.00913649  |
| 50      | 1m 52s       | 1.69529           | 1.30165               | 0.00913649  |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached.
Computing final objective value and training RMSE.
       Final objective value: 1.69493
       Final training RMSE: 1.30151
Recsys training: model = factorization_recommender
Preparing data set.
    Data has 2016130 observations with 37440 users and 28588 items.
    Data prepared in: 6.33711s
Training factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter                      | Description                                      | Value    |
+--------------------------------+--------------------------------------------------+----------+
| num_factors                    | Factor Dimension                                 | 5        |
| regularization                 | L2 Regularization on Factors                     | 0.0001   |
| solver                         | Solver used for training                         | adagrad  |
| linear_regularization          | L2 Regularization on Linear Coefficients         | 1e-10    |
| side_data_factorization        | Assign Factors for Side Data                     | True     |
| max_iterations                 | Maximum Number of Iterations                     | 50       |
+--------------------------------+--------------------------------------------------+----------+
  Optimizing model using SGD; tuning step size.
  Using 252016 / 2016130 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value                |
+---------+-------------------+------------------------------------------+
| 0       | 0.365459          | Not Viable                               |
| 1       | 0.0913649         | Not Viable                               |
| 2       | 0.0228412         | Not Viable                               |
| 3       | 0.0057103         | Not Viable                               |
| 4       | 0.00142758        | Not Viable                               |
| 5       | 0.000356894       | No Decrease (27.0033 >= 9.68896)         |
| 6       | 8.92235e-05       | 9.55192                                  |
| 7       | 4.46117e-05       | 9.3246                                   |
| 8       | 2.23059e-05       | 9.43828                                  |
| 9       | 1.11529e-05       | 9.54874                                  |
| 10      | 5.57647e-06       | 9.61445                                  |
+---------+-------------------+------------------------------------------+
| Final   | 4.46117e-05       | 9.3246                                   |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter.   | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size   |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 132us        | 9.70364           | 3.11507               |             |
+---------+--------------+-------------------+-----------------------+-------------+
| 1       | 3.08s        | 9.38809           | 3.06399               | 4.46117e-05 |
| 2       | 5.17s        | 9.38046           | 3.06275               | 4.46117e-05 |
| 3       | 7.25s        | 9.54726           | 3.08985               | 4.46117e-05 |
| 4       | 9.32s        | 9.7755            | 3.12656               | 4.46117e-05 |
| 5       | 11.46s       | 10.0445           | 3.16929               | 4.46117e-05 |
| 6       | 13.50s       | 10.3347           | 3.21475               | 4.46117e-05 |
| 7       | 15.55s       | 10.6558           | 3.26429               | 4.46117e-05 |
| 8       | 17.62s       | 10.9932           | 3.31558               | 4.46117e-05 |
| 9       | 20.38s       | DIVERGED          | DIVERGED              | 4.46117e-05 |
| RESET   | 21.17s       | 9.70388           | 3.11511               |             |
| 1       | 23.32s       | 9.4885            | 3.08034               | 2.23059e-05 |
| 2       | 25.41s       | 9.3726            | 3.06147               | 2.23059e-05 |
| 3       | 27.51s       | 9.33481           | 3.05529               | 2.23059e-05 |
| 4       | 29.62s       | 9.32324           | 3.05339               | 2.23059e-05 |
| 5       | 31.75s       | 9.32527           | 3.05373               | 2.23059e-05 |
| 6       | 33.89s       | 9.33216           | 3.05485               | 2.23059e-05 |
| 7       | 35.97s       | 9.35587           | 3.05873               | 2.23059e-05 |
| 8       | 38.02s       | 9.3867            | 3.06377               | 2.23059e-05 |
| 9       | 40.05s       | 9.42597           | 3.07017               | 2.23059e-05 |
| 10      | 42.01s       | 9.47391           | 3.07796               | 2.23059e-05 |
| 11      | 44.00s       | 9.52308           | 3.08594               | 2.23059e-05 |
| 12      | 45.97s       | 9.57851           | 3.0949                | 2.23059e-05 |
| 13      | 47.94s       | 9.63464           | 3.10396               | 2.23059e-05 |
| 14      | 49.90s       | 9.68339           | 3.1118                | 2.23059e-05 |
| 15      | 51.90s       | 9.7181            | 3.11737               | 2.23059e-05 |
| 16      | 53.86s       | 9.75337           | 3.12302               | 2.23059e-05 |
| 17      | 55.83s       | 9.78658           | 3.12834               | 2.23059e-05 |
| 18      | 57.78s       | 9.8263            | 3.13468               | 2.23059e-05 |
| 19      | 59.78s       | 9.86489           | 3.14083               | 2.23059e-05 |
| 20      | 1m 2s        | DIVERGED          | DIVERGED              | 2.23059e-05 |
| RESET   | 1m 3s        | 9.70166           | 3.11475               |             |
| 1       | 1m 5s        | 9.58879           | 3.09658               | 1.11529e-05 |
| 2       | 1m 7s        | 9.50795           | 3.0835                | 1.11529e-05 |
| 3       | 1m 9s        | 9.46259           | 3.07613               | 1.11529e-05 |
| 4       | 1m 11s       | 9.42893           | 3.07066               | 1.11529e-05 |
| 5       | 1m 13s       | 9.40219           | 3.0663                | 1.11529e-05 |
| 6       | 1m 15s       | 9.37842           | 3.06242               | 1.11529e-05 |
| 7       | 1m 17s       | 9.35986           | 3.05939               | 1.11529e-05 |
| 8       | 1m 19s       | 9.34728           | 3.05733               | 1.11529e-05 |
| 9       | 1m 21s       | 9.33646           | 3.05556               | 1.11529e-05 |
| 10      | 1m 23s       | 9.329             | 3.05434               | 1.11529e-05 |
| 11      | 1m 25s       | 9.32216           | 3.05322               | 1.11529e-05 |
| 12      | 1m 27s       | 9.31826           | 3.05258               | 1.11529e-05 |
| 13      | 1m 29s       | 9.31093           | 3.05138               | 1.11529e-05 |
| 14      | 1m 32s       | 9.30146           | 3.04983               | 1.11529e-05 |
| 15      | 1m 34s       | 9.29369           | 3.04855               | 1.11529e-05 |
| 16      | 1m 36s       | 9.2858            | 3.04726               | 1.11529e-05 |
| 17      | 1m 38s       | 9.27811           | 3.04599               | 1.11529e-05 |
| 18      | 1m 40s       | 9.27085           | 3.0448                | 1.11529e-05 |
| 19      | 1m 42s       | 9.26459           | 3.04377               | 1.11529e-05 |
| 20      | 1m 44s       | 9.25817           | 3.04272               | 1.11529e-05 |
| 21      | 1m 46s       | 9.25253           | 3.04179               | 1.11529e-05 |
| 22      | 1m 48s       | 9.24726           | 3.04093               | 1.11529e-05 |
| 23      | 1m 50s       | 9.24219           | 3.04009               | 1.11529e-05 |
| 24      | 1m 52s       | 9.23765           | 3.03934               | 1.11529e-05 |
| 25      | 1m 54s       | 9.23403           | 3.03875               | 1.11529e-05 |
| 26      | 1m 56s       | 9.23166           | 3.03836               | 1.11529e-05 |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached (hard limit).
Computing final objective value and training RMSE.
       Final objective value: 9.23061
       Final training RMSE: 3.03819

In [23]:
# Collect training RMSE (stored on each model) and held-out RMSE,
# predicting the test observations with the test-side item features.
(rmse_train, rmse_test) = ([], [])
for m in models:
    rmse_train.append(m['training_rmse'])
    rmse_test.append(gl.evaluation.rmse(obs_data_test['overall'], m.predict(obs_data_test,
                                                                          new_item_data=product_data_test)))

In [24]:
# NOTE(review): this cell duplicates the previous one exactly, and
# `new_test` is built but never used — likely leftover from an edit;
# consider deleting the cell or using `new_test` as intended.
new_test = test['overall', 'username', 'beerid']
(rmse_train, rmse_test) = ([], [])
for m in models:
    rmse_train.append(m['training_rmse'])
    rmse_test.append(gl.evaluation.rmse(obs_data_test['overall'], m.predict(obs_data_test,
                                                                          new_item_data=product_data_test)))

In [25]:
# Train/test RMSE versus regularization strength, with the popularity
# baseline drawn as a flat reference line (log-scaled x axis).
fig, ax = plt.subplots(figsize=(10, 8))
lines = ax.semilogx(regularization_vals, rmse_train,
                    regularization_vals, rmse_test,
                    regularization_vals, [baseline_rmse] * len(regularization_vals))
ax.set_xlabel('Regularization', fontsize=20)
ax.set_ylabel('RMSE', fontsize=20)
ax.legend(lines, ["Train", "Test", "Baseline"])
plt.show()


Again: refit the factorization models, this time without the subrating features


In [11]:
# Observation data for the "without subratings" fit: only item id, target
# rating, and user key are kept. This section keys users by 'username'
# (the later ranking section uses 'userid' instead).
# NOTE(review): multi-column selection via a bare tuple subscript —
# presumably treated like a list of column names by this SFrame type;
# confirm against the library's __getitem__ semantics.
obs_data_train = train['beerid',
                       'overall',
                      'username']
obs_data_test = test['beerid',
                       'overall',
                      'username']

In [12]:
# Item-side metadata (style and brewery) passed to the recommender as
# side data, keyed by 'beerid'.
product_data_train = train['beerid', 'style','breweryname']
product_data_test = test['beerid', 'style','breweryname']

In [ ]:
gl.ranking_factorization_recommender.create()

In [28]:
regularization_vals = [.001, .0001, .00001]

new_models = [gl.factorization_recommender.create(obs_data_train,
                                user_id="username",
                                item_id="beerid",
                                target="overall",
                                item_data=product_data_train,
                                max_iterations=50,
                                num_factors=5,
                                regularization=r)
                                              for r in regularization_vals]


Recsys training: model = factorization_recommender
Preparing data set.
    Data has 2016130 observations with 37440 users and 28588 items.
    Data prepared in: 4.73642s
Training factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter                      | Description                                      | Value    |
+--------------------------------+--------------------------------------------------+----------+
| num_factors                    | Factor Dimension                                 | 5        |
| regularization                 | L2 Regularization on Factors                     | 0.001    |
| solver                         | Solver used for training                         | adagrad  |
| linear_regularization          | L2 Regularization on Linear Coefficients         | 1e-10    |
| side_data_factorization        | Assign Factors for Side Data                     | True     |
| max_iterations                 | Maximum Number of Iterations                     | 50       |
+--------------------------------+--------------------------------------------------+----------+
  Optimizing model using SGD; tuning step size.
  Using 252016 / 2016130 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value                |
+---------+-------------------+------------------------------------------+
| 0       | 0.0365459         | 4.98453                                  |
| 1       | 0.018273          | 5.54143                                  |
| 2       | 0.00913649        | 6.1053                                   |
| 3       | 0.00456824        | 6.70601                                  |
+---------+-------------------+------------------------------------------+
| Final   | 0.0365459         | 4.98453                                  |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter.   | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size   |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 100us        | 9.70587           | 3.11542               |             |
+---------+--------------+-------------------+-----------------------+-------------+
| 1       | 1.24s        | 5.31502           | 2.30376               | 0.0365459   |
| 2       | 2.43s        | 4.85652           | 2.2006                | 0.0365459   |
| 3       | 4.04s        | 4.72144           | 2.17267               | 0.0365459   |
| 4       | 5.49s        | 4.65276           | 2.15693               | 0.0365459   |
| 5       | 6.90s        | 4.60139           | 2.14507               | 0.0365459   |
| 6       | 8.21s        | 4.56296           | 2.13609               | 0.0365459   |
| 7       | 9.75s        | 4.53512           | 2.12956               | 0.0365459   |
| 8       | 10.95s       | 4.51109           | 2.1239                | 0.0365459   |
| 9       | 12.12s       | 4.49217           | 2.11899               | 0.0365459   |
| 10      | 13.82s       | 4.47544           | 2.11496               | 0.0365459   |
| 11      | 15.21s       | 4.45575           | 2.11061               | 0.0365459   |
| 12      | 16.61s       | 4.44698           | 2.10849               | 0.0365459   |
| 13      | 18.02s       | 4.43606           | 2.10582               | 0.0365459   |
| 14      | 19.32s       | 4.42604           | 2.1036                | 0.0365459   |
| 15      | 20.51s       | 4.41653           | 2.10131               | 0.0365459   |
| 16      | 21.68s       | 4.40921           | 2.09956               | 0.0365459   |
| 17      | 22.83s       | 4.40497           | 2.09715               | 0.0365459   |
| 18      | 24.00s       | 4.39248           | 2.09521               | 0.0365459   |
| 19      | 25.18s       | 4.3885            | 2.09435               | 0.0365459   |
| 20      | 26.81s       | 4.38171           | 2.09298               | 0.0365459   |
| 21      | 28.39s       | 4.37739           | 2.09198               | 0.0365459   |
| 22      | 29.94s       | 4.36917           | 2.0901                | 0.0365459   |
| 23      | 31.78s       | 4.36744           | 2.08963               | 0.0365459   |
| 24      | 33.80s       | 4.36202           | 2.08826               | 0.0365459   |
| 25      | 35.22s       | 4.35825           | 2.08694               | 0.0365459   |
| 26      | 36.63s       | 4.35169           | 2.0854                | 0.0365459   |
| 27      | 38.07s       | 4.34984           | 2.08521               | 0.0365459   |
| 28      | 40.16s       | 4.34772           | 2.08468               | 0.0365459   |
| 29      | 41.55s       | 4.33978           | 2.08279               | 0.0365459   |
| 30      | 42.81s       | 4.34043           | 2.08286               | 0.0365459   |
| 31      | 44.19s       | 4.33579           | 2.08172               | 0.0365459   |
| 32      | 45.65s       | 4.33426           | 2.08133               | 0.0365459   |
| 33      | 47.04s       | 4.33064           | 2.0805                | 0.0365459   |
| 34      | 48.45s       | 4.32605           | 2.07952               | 0.0365459   |
| 35      | 50.44s       | 4.32615           | 2.07912               | 0.0365459   |
| 36      | 52.01s       | 4.31994           | 2.07775               | 0.0365459   |
| 37      | 53.50s       | 4.31881           | 2.07784               | 0.0365459   |
| 38      | 55.09s       | 4.3165            | 2.07727               | 0.0365459   |
| 39      | 56.53s       | 4.31604           | 2.07708               | 0.0365459   |
| 40      | 57.93s       | 4.30883           | 2.07549               | 0.0365459   |
| 41      | 59.17s       | 4.31082           | 2.0759                | 0.0365459   |
| 42      | 1m 0s        | 4.30881           | 2.07533               | 0.0365459   |
| 43      | 1m 1s        | 4.30936           | 2.07452               | 0.0365459   |
| 44      | 1m 3s        | 4.29778           | 2.07268               | 0.0365459   |
| 45      | 1m 4s        | 4.30269           | 2.07382               | 0.0365459   |
| 46      | 1m 5s        | 4.30017           | 2.07335               | 0.0365459   |
| 47      | 1m 7s        | 4.29892           | 2.07302               | 0.0365459   |
| 48      | 1m 9s        | 4.29737           | 2.07257               | 0.0365459   |
| 49      | 1m 10s       | 4.29568           | 2.07159               | 0.0365459   |
| 50      | 1m 12s       | 4.29207           | 2.07081               | 0.0365459   |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached.
Computing final objective value and training RMSE.
       Final objective value: 4.29561
       Final training RMSE: 2.07167
Recsys training: model = factorization_recommender
Preparing data set.
    Data has 2016130 observations with 37440 users and 28588 items.
    Data prepared in: 4.53475s
Training factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter                      | Description                                      | Value    |
+--------------------------------+--------------------------------------------------+----------+
| num_factors                    | Factor Dimension                                 | 5        |
| regularization                 | L2 Regularization on Factors                     | 0.0001   |
| solver                         | Solver used for training                         | adagrad  |
| linear_regularization          | L2 Regularization on Linear Coefficients         | 1e-10    |
| side_data_factorization        | Assign Factors for Side Data                     | True     |
| max_iterations                 | Maximum Number of Iterations                     | 50       |
+--------------------------------+--------------------------------------------------+----------+
  Optimizing model using SGD; tuning step size.
  Using 252016 / 2016130 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value                |
+---------+-------------------+------------------------------------------+
| 0       | 0.365459          | 4.11757                                  |
| 1       | 0.18273           | 4.13411                                  |
| 2       | 0.0913649         | 4.06084                                  |
| 3       | 0.0456824         | 4.21883                                  |
| 4       | 0.0228412         | 4.5177                                   |
| 5       | 0.0114206         | 4.92612                                  |
+---------+-------------------+------------------------------------------+
| Final   | 0.0913649         | 4.06084                                  |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter.   | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size   |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 116us        | 9.70563           | 3.11539               |             |
+---------+--------------+-------------------+-----------------------+-------------+
| 1       | 1.24s        | 4.74109           | 2.17275               | 0.0913649   |
| 2       | 2.39s        | 4.49107           | 2.09786               | 0.0913649   |
| 3       | 3.53s        | 4.37403           | 2.05701               | 0.0913649   |
| 4       | 4.70s        | 4.30689           | 2.03544               | 0.0913649   |
| 5       | 5.86s        | 4.26242           | 2.02111               | 0.0913649   |
| 6       | 7.01s        | 4.24258           | 2.01265               | 0.0913649   |
| 7       | 8.16s        | 4.22528           | 2.00471               | 0.0913649   |
| 8       | 9.32s        | 4.20777           | 1.99798               | 0.0913649   |
| 9       | 10.45s       | 4.18638           | 1.99093               | 0.0913649   |
| 10      | 11.62s       | 4.17498           | 1.98653               | 0.0913649   |
| 11      | 12.75s       | 4.15824           | 1.98122               | 0.0913649   |
| 12      | 13.90s       | 4.13866           | 1.97596               | 0.0913649   |
| 13      | 15.06s       | 4.13586           | 1.97419               | 0.0913649   |
| 14      | 16.20s       | 4.12538           | 1.97048               | 0.0913649   |
| 15      | 17.36s       | 4.12134           | 1.96812               | 0.0913649   |
| 16      | 18.48s       | 4.10966           | 1.96422               | 0.0913649   |
| 17      | 19.64s       | 4.10593           | 1.96231               | 0.0913649   |
| 18      | 20.81s       | 4.09821           | 1.95957               | 0.0913649   |
| 19      | 21.96s       | 4.07413           | 1.95391               | 0.0913649   |
| 20      | 23.12s       | 4.08789           | 1.95598               | 0.0913649   |
| 21      | 24.26s       | 4.06972           | 1.95135               | 0.0913649   |
| 22      | 25.48s       | 4.0657            | 1.94975               | 0.0913649   |
| 23      | 26.63s       | 4.06168           | 1.94835               | 0.0913649   |
| 24      | 27.81s       | 4.05513           | 1.9463                | 0.0913649   |
| 25      | 28.96s       | 4.05675           | 1.94581               | 0.0913649   |
| 26      | 30.10s       | 4.04961           | 1.94344               | 0.0913649   |
| 27      | 31.28s       | 4.04002           | 1.94087               | 0.0913649   |
| 28      | 32.45s       | 4.04279           | 1.94073               | 0.0913649   |
| 29      | 33.60s       | 4.04266           | 1.93997               | 0.0913649   |
| 30      | 34.77s       | 4.03225           | 1.93712               | 0.0913649   |
| 31      | 35.89s       | 4.02924           | 1.93603               | 0.0913649   |
| 32      | 37.15s       | 4.0252            | 1.93461               | 0.0913649   |
| 33      | 38.29s       | 4.02919           | 1.93456               | 0.0913649   |
| 34      | 39.43s       | 4.0208            | 1.93231               | 0.0913649   |
| 35      | 40.59s       | 4.02068           | 1.93174               | 0.0913649   |
| 36      | 41.73s       | 4.00825           | 1.92859               | 0.0913649   |
| 37      | 42.85s       | 4.01265           | 1.92924               | 0.0913649   |
| 38      | 44.00s       | 4.00545           | 1.92722               | 0.0913649   |
| 39      | 45.16s       | 4.00787           | 1.92728               | 0.0913649   |
| 40      | 46.30s       | 4.00765           | 1.92659               | 0.0913649   |
| 41      | 47.46s       | 3.99674           | 1.92385               | 0.0913649   |
| 42      | 48.62s       | 4.00063           | 1.92431               | 0.0913649   |
| 43      | 49.76s       | 4.00159           | 1.92384               | 0.0913649   |
| 44      | 50.91s       | 3.99773           | 1.92258               | 0.0913649   |
| 45      | 52.12s       | 3.99037           | 1.92061               | 0.0913649   |
| 46      | 53.26s       | 3.99133           | 1.92052               | 0.0913649   |
| 47      | 54.39s       | 3.98676           | 1.9191                | 0.0913649   |
| 48      | 55.54s       | 3.99044           | 1.9194                | 0.0913649   |
| 49      | 56.75s       | 3.98941           | 1.91865               | 0.0913649   |
| 50      | 57.88s       | 3.98067           | 1.91654               | 0.0913649   |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached.
Computing final objective value and training RMSE.
       Final objective value: 3.98725
       Final training RMSE: 1.91826
Recsys training: model = factorization_recommender
Preparing data set.
    Data has 2016130 observations with 37440 users and 28588 items.
    Data prepared in: 3.57065s
Training factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter                      | Description                                      | Value    |
+--------------------------------+--------------------------------------------------+----------+
| num_factors                    | Factor Dimension                                 | 5        |
| regularization                 | L2 Regularization on Factors                     | 1e-05    |
| solver                         | Solver used for training                         | adagrad  |
| linear_regularization          | L2 Regularization on Linear Coefficients         | 1e-10    |
| side_data_factorization        | Assign Factors for Side Data                     | True     |
| max_iterations                 | Maximum Number of Iterations                     | 50       |
+--------------------------------+--------------------------------------------------+----------+
  Optimizing model using SGD; tuning step size.
  Using 252016 / 2016130 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value                |
+---------+-------------------+------------------------------------------+
| 0       | 3.65459           | 4.30825                                  |
| 1       | 1.8273            | 3.1871                                   |
| 2       | 0.913649          | 2.71582                                  |
| 3       | 0.456824          | 2.66137                                  |
| 4       | 0.228412          | 2.81649                                  |
| 5       | 0.114206          | 3.20318                                  |
| 6       | 0.057103          | 3.63032                                  |
+---------+-------------------+------------------------------------------+
| Final   | 0.456824          | 2.66137                                  |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter.   | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size   |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 169us        | 9.70598           | 3.11544               |             |
+---------+--------------+-------------------+-----------------------+-------------+
| 1       | 1.19s        | 4.51443           | 2.11084               | 0.456824    |
| 2       | 2.36s        | 4.23028           | 2.00969               | 0.456824    |
| 3       | 3.56s        | 4.02852           | 1.93511               | 0.456824    |
| 4       | 4.73s        | 3.93147           | 1.89329               | 0.456824    |
| 5       | 5.89s        | 3.88011           | 1.86834               | 0.456824    |
| 6       | 7.04s        | 3.84312           | 1.85106               | 0.456824    |
| 7       | 8.20s        | 3.81181           | 1.83823               | 0.456824    |
| 8       | 9.40s        | 3.79592           | 1.82962               | 0.456824    |
| 9       | 10.56s       | 3.7802            | 1.82225               | 0.456824    |
| 10      | 11.72s       | 3.77501           | 1.81781               | 0.456824    |
| 11      | 12.88s       | 3.76              | 1.81206               | 0.456824    |
| 12      | 14.05s       | 3.75193           | 1.80857               | 0.456824    |
| 13      | 15.19s       | 3.74296           | 1.80531               | 0.456824    |
| 14      | 16.36s       | 3.71337           | 1.79893               | 0.456824    |
| 15      | 17.52s       | 3.72482           | 1.80041               | 0.456824    |
| 16      | 18.67s       | 3.70574           | 1.7964                | 0.456824    |
| 17      | 19.82s       | 3.70041           | 1.79495               | 0.456824    |
| 18      | 20.99s       | 3.69379           | 1.7936                | 0.456824    |
| 19      | 22.32s       | 3.68764           | 1.79235               | 0.456824    |
| 20      | 23.66s       | 3.68513           | 1.79139               | 0.456824    |
| 21      | 24.89s       | 3.67686           | 1.78955               | 0.456824    |
| 22      | 26.11s       | 3.66919           | 1.78831               | 0.456824    |
| 23      | 27.45s       | 3.6699            | 1.78813               | 0.456824    |
| 24      | 28.60s       | 3.66658           | 1.78751               | 0.456824    |
| 25      | 29.79s       | 3.6563            | 1.78558               | 0.456824    |
| 26      | 30.94s       | 3.65213           | 1.78492               | 0.456824    |
| 27      | 32.11s       | 3.64681           | 1.784                 | 0.456824    |
| 28      | 33.27s       | 3.64784           | 1.78388               | 0.456824    |
| 29      | 34.43s       | 3.64232           | 1.78316               | 0.456824    |
| 30      | 35.59s       | 3.63981           | 1.78276               | 0.456824    |
| 31      | 36.75s       | 3.62779           | 1.78052               | 0.456824    |
| 32      | 37.90s       | 3.63107           | 1.7816                | 0.456824    |
| 33      | 39.08s       | 3.62392           | 1.7804                | 0.456824    |
| 34      | 40.24s       | 3.62367           | 1.78034               | 0.456824    |
| 35      | 41.38s       | 3.62294           | 1.78027               | 0.456824    |
| 36      | 42.59s       | 3.61341           | 1.77864               | 0.456824    |
| 37      | 43.73s       | 3.6168            | 1.77958               | 0.456824    |
| 38      | 44.88s       | 3.61577           | 1.77923               | 0.456824    |
| 39      | 46.01s       | 3.61054           | 1.77837               | 0.456824    |
| 40      | 47.16s       | 3.60524           | 1.77747               | 0.456824    |
| 41      | 48.30s       | 3.60412           | 1.77744               | 0.456824    |
| 42      | 49.47s       | 3.59984           | 1.77668               | 0.456824    |
| 43      | 50.63s       | 3.60133           | 1.777                 | 0.456824    |
| 44      | 51.83s       | 3.60002           | 1.77676               | 0.456824    |
| 45      | 53.00s       | 3.59255           | 1.77567               | 0.456824    |
| 46      | 54.16s       | 3.59236           | 1.7759                | 0.456824    |
| 47      | 55.35s       | 3.59098           | 1.77551               | 0.456824    |
| 48      | 56.56s       | 3.59034           | 1.77574               | 0.456824    |
| 49      | 57.71s       | 3.58519           | 1.77471               | 0.456824    |
| 50      | 58.90s       | 3.58583           | 1.7749                | 0.456824    |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached.
Computing final objective value and training RMSE.
       Final objective value: 3.55829
       Final training RMSE: 1.76712

In [ ]:
# Evaluate the new sweep: training RMSE straight off each model, test RMSE
# against the held-out 'overall' ratings with the test split's item side data.
rmse_train = [m['training_rmse'] for m in new_models]
rmse_test = [gl.evaluation.rmse(obs_data_test['overall'],
                                m.predict(obs_data_test,
                                          new_item_data=product_data_test))
             for m in new_models]

In [ ]:
# Same diagnostic plot for the subrating-free sweep: train/test RMSE versus
# regularization (log x-axis) plus the flat baseline-RMSE reference.
fig, ax = plt.subplots(figsize=(10, 8))
lines = ax.semilogx(regularization_vals, rmse_train,
                    regularization_vals, rmse_test,
                    regularization_vals,
                    [baseline_rmse] * len(regularization_vals))
#ax.set_ylim([0.45, .7])
ax.set_xlabel('Regularization', fontsize=20)
ax.set_ylabel('RMSE', fontsize=20)
ax.legend(lines, ["Train", "Test", "Baseline"])
plt.show()

Ranking Factorization Model


In [21]:
# Observation data for the ranking factorization model. Note the user key
# switches to 'userid' here (earlier sections used 'username').
# NOTE(review): bare-tuple column selection — presumably equivalent to a
# list of column names for this SFrame type; confirm.
obs_data_train = train['beerid',
                       'overall',
                       'userid']
obs_data_test = test['beerid',
                      'overall',
                      'userid']

In [22]:
# Item-side metadata (style and brewery) for the ranking model, keyed by
# 'beerid'. Same columns as the earlier factorization sections.
product_data_train = train['beerid', 'style','breweryname']
product_data_test = test['beerid', 'style','breweryname']

In [21]:
# Fit the ranking factorization recommender on the userid-keyed observations
# with style/brewery item side data. Hyperparameters are left at library
# defaults (the printed training log below shows the values actually used:
# 32 factors, 25 iterations, ranking_regularization 0.25).
model = gl.recommender.ranking_factorization_recommender.create(obs_data_train,
                                                    user_id='userid',
                                                    item_id='beerid',
                                                    target='overall',
                                                    item_data=product_data_train)


Recsys training: model = ranking_factorization_recommender
Preparing data set.
    Data has 2401680 observations with 39805 users and 28588 items.
    Data prepared in: 16.5124s
Training ranking_factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter                      | Description                                      | Value    |
+--------------------------------+--------------------------------------------------+----------+
| num_factors                    | Factor Dimension                                 | 32       |
| regularization                 | L2 Regularization on Factors                     | 1e-09    |
| solver                         | Solver used for training                         | adagrad  |
| linear_regularization          | L2 Regularization on Linear Coefficients         | 1e-09    |
| ranking_regularization         | Rank-based Regularization Weight                 | 0.25     |
| side_data_factorization        | Assign Factors for Side Data                     | True     |
| max_iterations                 | Maximum Number of Iterations                     | 25       |
+--------------------------------+--------------------------------------------------+----------+
  Optimizing model using SGD; tuning step size.
  Using 300210 / 2401680 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value                |
+---------+-------------------+------------------------------------------+
| 0       | 12.5              | Not Viable                               |
| 1       | 3.125             | Not Viable                               |
| 2       | 0.78125           | 7.97431                                  |
| 3       | 0.390625          | 5.77227                                  |
| 4       | 0.195312          | 5.61225                                  |
| 5       | 0.0976562         | 7.34201                                  |
| 6       | 0.0488281         | 10.0607                                  |
| 7       | 0.0244141         | 11.452                                   |
+---------+-------------------+------------------------------------------+
| Final   | 0.195312          | 5.61225                                  |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter.   | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size   |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 580us        | 19.0497           | 3.11712               |             |
+---------+--------------+-------------------+-----------------------+-------------+
| 1       | 11.14s       | 10.1766           | 2.52339               | 0.195312    |
| 2       | 19.52s       | 8.9203            | 2.30646               | 0.195312    |
| 3       | 31.02s       | 8.16278           | 2.19179               | 0.195312    |
| 4       | 41.86s       | 7.77661           | 2.13534               | 0.195312    |
| 5       | 1m 0s        | 7.50495           | 2.09424               | 0.195312    |
| 6       | 1m 12s       | 7.3163            | 2.06185               | 0.195312    |
| 7       | 1m 22s       | 7.18057           | 2.04171               | 0.195312    |
| 8       | 1m 33s       | 7.0646            | 2.02087               | 0.195312    |
| 9       | 1m 45s       | 6.97034           | 2.00603               | 0.195312    |
| 10      | 1m 55s       | 6.89574           | 1.99443               | 0.195312    |
| 11      | 2m 4s        | 6.82627           | 1.98117               | 0.195312    |
| 12      | 2m 13s       | 6.77796           | 1.97179               | 0.195312    |
| 13      | 2m 21s       | 6.72605           | 1.9622                | 0.195312    |
| 14      | 2m 29s       | 6.682             | 1.95846               | 0.195312    |
| 15      | 2m 37s       | 6.64337           | 1.94822               | 0.195312    |
| 16      | 2m 45s       | 6.60703           | 1.94256               | 0.195312    |
| 17      | 2m 57s       | 6.56931           | 1.93726               | 0.195312    |
| 18      | 3m 8s        | 6.54388           | 1.93196               | 0.195312    |
| 19      | 3m 19s       | 6.52065           | 1.92855               | 0.195312    |
| 20      | 3m 27s       | 6.48674           | 1.92002               | 0.195312    |
| 21      | 3m 36s       | 6.47323           | 1.9197                | 0.195312    |
| 22      | 3m 45s       | 6.44697           | 1.91603               | 0.195312    |
| 23      | 3m 54s       | 6.42417           | 1.91036               | 0.195312    |
| 24      | 4m 2s        | 6.40699           | 1.90932               | 0.195312    |
| 25      | 4m 10s       | 6.38896           | 1.90422               | 0.195312    |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached.
Computing final objective value and training RMSE.
       Final objective value: 6.41255
       Final training RMSE: 1.86199

In [32]:
# Top-10 recommendations for a single user (userid 409311 — shown as
# username 'monty_pilsner' in the lookup output later in this notebook).
model.recommend(users=[409311])


Out[32]:
userid beerid score rank
409311 7286 15.8626083873 1
409311 7688 15.2993126811 2
409311 48 15.2703035773 3
409311 585 15.2374856771 4
409311 11327 15.166824341 5
409311 13 15.0901842715 6
409311 16 15.0901675524 7
409311 8936 15.0643333496 8
409311 680 15.0313988031 9
409311 589 14.9974100293 10
[10 rows x 4 columns]

In [35]:
# Sanity check: look up the top-recommended beer (beerid 7286) in the
# raw SFrame to see its name, brewery, and ratings.
SFdata[SFdata['beerid'] == 7286]


Out[35]:
beerid beername breweryname overall userid username style
7286 Mac and Jack’s African
Amber Ale ...
Mac and Jack’s Brewing
Company ...
14 70964 nickd717 Amber Ale
7286 Mac and Jack’s African
Amber Ale ...
Mac and Jack’s Brewing
Company ...
14 4979 ClarkVV Amber Ale
7286 Mac and Jack’s African
Amber Ale ...
Mac and Jack’s Brewing
Company ...
14 93397 bulldogops Amber Ale
7286 Mac and Jack’s African
Amber Ale ...
Mac and Jack’s Brewing
Company ...
17 129516 tightslice Amber Ale
7286 Mac and Jack’s African
Amber Ale ...
Mac and Jack’s Brewing
Company ...
15 94786 brucebruce81 Amber Ale
7286 Mac and Jack’s African
Amber Ale ...
Mac and Jack’s Brewing
Company ...
13 359934 bartzluke Amber Ale
7286 Mac and Jack’s African
Amber Ale ...
Mac and Jack’s Brewing
Company ...
14 404749 teamsale Amber Ale
7286 Mac and Jack’s African
Amber Ale ...
Mac and Jack’s Brewing
Company ...
13 386521 sebastokrator Amber Ale
7286 Mac and Jack’s African
Amber Ale ...
Mac and Jack’s Brewing
Company ...
17 137798 ads135 Amber Ale
7286 Mac and Jack’s African
Amber Ale ...
Mac and Jack’s Brewing
Company ...
8 95172 jvhemert Amber Ale
[? rows x 7 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use sf.materialize() to force materialization.


In [43]:
# Nearest items under the model's learned similarity, for the top
# recommendation (7286) and the beer the sample user actually rated (24542).
model.get_similar_items(items=[7286, 24542])


Out[43]:
beerid similar score rank
7286 204413 0.656512975693 1
7286 18440 0.644244611263 2
7286 97648 0.634415686131 3
7286 52644 0.601191461086 4
7286 92877 0.598866164684 5
7286 16136 0.595318078995 6
7286 91163 0.573130786419 7
7286 141073 0.568260610104 8
7286 122293 0.554176092148 9
7286 69823 0.552317678928 10
[20 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [42]:
# Look up all ratings by the sample user to see what the recommendations
# were based on.
SFdata[SFdata['username'] == 'monty_pilsner']


Out[42]:
beerid beername breweryname overall userid username style
24542 Georgetown Manny’s Pale
Ale ...
Georgetown Brewing
Company ...
18 409311 monty_pilsner American Pale Ale
[? rows x 7 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use sf.materialize() to force materialization.


In [44]:
model.save('/Users/Gevurtz/galvanize/beer_rec_project/beer-recommender/webapp/rankmodel')

In [45]:
l_model = gl.load_model('/Users/Gevurtz/galvanize/beer_rec_project/beer-recommender/webapp/rankmodel')

In [29]:
similarity_model_cos = gl.recommender.item_similarity_recommender.create(obs_data_train,
                                                          user_id='userid',
                                                          item_id='beerid', 
                                                          target='overall',
                                                          similarity_type='cosine')


Recsys training: model = item_similarity
Preparing data set.
    Data has 2401680 observations with 39805 users and 28588 items.
    Data prepared in: 2.84257s
Training model from provided data.
Gathering per-item and per-user statistics.
+--------------------------------+------------+
| Elapsed Time (Item Statistics) | % Complete |
+--------------------------------+------------+
| 74.645ms                       | 2.5        |
| 165.286ms                      | 100        |
+--------------------------------+------------+
Setting up lookup tables.
Processing data in one pass using dense lookup tables.
+-------------------------------------+------------------+-----------------+
| Elapsed Time (Constructing Lookups) | Total % Complete | Items Processed |
+-------------------------------------+------------------+-----------------+
| 3.17s                               | 0                | 0               |
| 4.40s                               | 3                | 873             |
| 5.72s                               | 4                | 1164            |
| 6.30s                               | 4.75             | 1399            |
| 7.40s                               | 7                | 2011            |
| 8.19s                               | 9                | 2575            |
| 9.19s                               | 11               | 3203            |
| 10.29s                              | 13.25            | 3824            |
| 11.24s                              | 15.75            | 4571            |
| 12.38s                              | 17.5             | 5013            |
| 13.49s                              | 20.25            | 5799            |
| 14.22s                              | 21.5             | 6209            |
| 15.22s                              | 23.25            | 6709            |
| 16.28s                              | 25.25            | 7273            |
| 17.21s                              | 27.5             | 7884            |
| 18.27s                              | 29.5             | 8449            |
| 19.35s                              | 30.75            | 8798            |
| 20.21s                              | 32               | 9183            |
| 21.33s                              | 34.75            | 9956            |
| 22.39s                              | 38               | 10924           |
| 23.26s                              | 38.75            | 11093           |
| 24.24s                              | 40.75            | 11690           |
| 25.25s                              | 42.5             | 12154           |
| 26.31s                              | 44.75            | 12810           |
| 27.28s                              | 45.75            | 13125           |
| 28.24s                              | 47               | 13480           |
| 29.31s                              | 49.25            | 14116           |
| 30.26s                              | 50.5             | 14493           |
| 31.26s                              | 52               | 14915           |
| 32.27s                              | 53.75            | 15388           |
| 33.28s                              | 55.25            | 15804           |
| 34.27s                              | 56.75            | 16262           |
| 35.33s                              | 58.75            | 16847           |
| 36.37s                              | 60.25            | 17243           |
| 37.26s                              | 61.25            | 17580           |
| 38.30s                              | 63               | 18011           |
| 39.58s                              | 64.25            | 18397           |
| 40.27s                              | 66               | 18879           |
| 41.32s                              | 67               | 19171           |
| 42.27s                              | 68.5             | 19641           |
| 43.28s                              | 70.25            | 20108           |
| 44.29s                              | 71.75            | 20582           |
| 45.33s                              | 73.5             | 21057           |
| 46.29s                              | 75.75            | 21662           |
| 47.28s                              | 77.5             | 22158           |
| 48.31s                              | 80               | 22911           |
| 49.29s                              | 82.5             | 23629           |
| 50.30s                              | 83.5             | 23942           |
| 51.30s                              | 85.25            | 24378           |
| 52.30s                              | 86.25            | 24663           |
| 53.30s                              | 87               | 24924           |
| 54.33s                              | 88               | 25161           |
| 55.31s                              | 89               | 25493           |
| 56.31s                              | 89.75            | 25672           |
| 57.32s                              | 90.5             | 25910           |
| 58.36s                              | 91.75            | 26247           |
| 59.31s                              | 92.5             | 26476           |
| 1m 0s                               | 93.5             | 26764           |
| 1m 1s                               | 94.75            | 27116           |
| 1m 2s                               | 95.75            | 27384           |
| 1m 3s                               | 96.25            | 27585           |
| 1m 4s                               | 97.25            | 27849           |
| 1m 5s                               | 98               | 28069           |
| 1m 6s                               | 99               | 28316           |
| 1m 7s                               | 99.5             | 28463           |
| 1m 8s                               | 99.75            | 28584           |
| 1m 25s                              | 100              | 28588           |
+-------------------------------------+------------------+-----------------+
Finalizing lookup tables.
Generating candidate set for working with new users.
Finished training in 85.6106s

In [25]:
similarity_model.get_similar_items([24542])


Out[25]:
beerid similar score rank
24542 7286 0.220779240131 1
24542 10253 0.202020227909 2
24542 99720 0.1954023242 3
24542 1124 0.194915235043 4
24542 15083 0.194690287113 5
24542 15565 0.193103432655 6
24542 43643 0.190476179123 7
24542 66545 0.188679218292 8
24542 1108 0.188153326511 9
24542 5011 0.18359375 10
[10 rows x 4 columns]


In [26]:
similarity_model.get_current_options()


Out[26]:
{'degree_approximation_threshold': 4096,
 'item_id': 'beerid',
 'max_data_passes': 4096,
 'max_item_neighborhood_size': 64,
 'nearest_neighbors_interaction_proportion_threshold': 0.05,
 'seed_item_set_size': 50,
 'similarity_type': 'jaccard',
 'sparse_density_estimation_sample_size': 4096,
 'target': 'overall',
 'target_memory_usage': 8589934592,
 'threshold': 0.001,
 'training_method': 'auto',
 'user_id': 'userid'}

In [31]:
similarity_model_cos.get_similar_items([24542])


Out[31]:
beerid similar score rank
24542 344928 1.0 1
24542 217171 0.871789693832 2
24542 180390 0.847396790981 3
24542 341646 0.628716289997 4
24542 146458 0.493217110634 5
24542 950 0.483016133308 6
24542 32586 0.434112012386 7
24542 281428 0.425775468349 8
24542 270966 0.40780967474 9
24542 268000 0.399466693401 10
[10 rows x 4 columns]


In [ ]:
similarity_model.save('/Users/Gevurtz/galvanize/beer_rec_project/beer-recommender/webapp/similarity')