This notebook lays out the steps we have been using to run Hermes.

It saves intermediate products, then reads those products back into the context for the next step.

This helps prevent any losses during the run

First, set up your Spark context (if necessary) and load hermes.zip into the context.

hermes.zip can be found on GitHub at https://github.com/Lab41/hermes.git



In [ ]:

    
# This cell sets up a Spark context and a SQL context if you are running locally.
# If you are on a cluster or have deployed Spark a different way you don't need this.
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

# getOrCreate() hands back the SparkContext that is already running (for example
# when this cell is re-run) and only builds a new one when none exists. Unlike a
# bare `except:` fallback it does not hide unrelated start-up failures, and it
# never leaves `sc` silently set to None.
sc = SparkContext.getOrCreate()

# SQL entry point used below to read the JSON input files.
sqlCtx = SQLContext(sc)



In [ ]:

    
# Load the Hermes code archive into the Spark context (adjust the path to point
# at your own copy of hermes.zip).
sc.addPyFile('hermes/hermes.zip')
# NOTE(review): this import is kept after addPyFile on the assumption that the
# `src` package is provided by hermes.zip -- confirm before moving it.
from src import hermes_run_script
# pandas is used at the end of the notebook to read the consolidated results csv.
import pandas as pd

Read in the data - for instance the MovieLens data can be found at: http://grouplens.org/datasets/movielens/

The data then needs to be transformed into json files. Each dataset has its own ETL folder in hermes/src/utils

Once the data has been transformed, load the JSON files.



In [ ]:

    
# Read each transformed MovieLens JSON file through the SQL context.
movies = sqlCtx.read.json('movielens_20m_movies.json.gz')
ratings = sqlCtx.read.json('movielens_20m_ratings.json.gz')

# We found the best tag set is the one in MovieLens 20M, and it is usable for
# all of the MovieLens datasets.
tags = sqlCtx.read.json('movielens_20m_tags.json.gz')

Set up all of the parameters necessary for the runner



In [ ]:

    
# Name of the dataset: used to pick the correct vectorizer and when saving files.
data_name = 'movielens_20m'

# User vector types to assess (the available choices differ per dataset).
user_vector_types = [
    'ratings',
    'pos_ratings',
    'ratings_to_interact',
]

# Content vector types to assess (the available choices differ per dataset).
content_vector_types = [
    'genre',
    'tags',
]

# Directory for intermediate files: user vectors, content vectors and predictions.
# This can be on HDFS.
directory = 'HDFS/movielens/data'

# Directory for the csv results files. This should NOT be on HDFS.
results_directory = 'movielens/results'

# Collaborative filtering algorithms to run.
cf_predictions = [
    'cf_mllib',
    'cf_item',
    'cf_user',
]

# Content based algorithms to run.
cb_predictions = [
    'cb_vect',
    'cb_kmeans_100',
    'cb_kmeans_1000',
]

# Number of predictions to give to a user.
result_runs = [100, 1000]

# Extra items needed to build the content vectors. For MovieLens this means the
# user tags, which are required if you want to run the tag content vector.
support_files = dict(num_tags=300, tag_rdd=tags)

Pass all of the variables into the Hermes runner



In [ ]:

    
# Build the runner. The positional arguments keep their original order, and the
# entries of support_files are forwarded as additional keyword arguments.
runner = hermes_run_script.hermes_run(
    ratings,
    movies,
    user_vector_types,
    content_vector_types,
    sqlCtx,
    sc,
    data_name,
    directory,
    results_directory,
    cf_predictions,
    cb_predictions,
    result_runs,
    num_partitions=30,
    **support_files
)



In [ ]:

    
# Run the vectorizers.
# NOTE(review): presumably this builds the user and content vectors for the types
# configured above and saves them under `directory` -- confirm in hermes_run_script.
runner.run_vectorizer()



In [ ]:

    
# Run the collaborative filtering algorithms (the ones listed in cf_predictions,
# which was passed to the runner when it was created).
runner.run_cf_predictions()



In [ ]:

    
# Run the content based algorithms (the ones listed in cb_predictions, which was
# passed to the runner when it was created).
runner.run_cb_predictions()



In [ ]:

    
# Get the results for the collaborative filtering predictions.
# NOTE(review): presumably requires run_cf_predictions() to have finished -- confirm.
runner.run_cf_results()



In [ ]:

    
# Get the results for the content based predictions.
# NOTE(review): presumably requires run_cb_predictions() to have finished -- confirm.
runner.run_cb_results()



In [ ]:

    
# Consolidate all of the results into a single csv file.
# The next cell reads that file back from
# results_directory + data_name + '_full_results.csv'.
runner.consolidate_results()

View the results



In [10]:

    
# Location of the consolidated results csv.
# NOTE(review): the directory and dataset name are joined with no path separator,
# so with the values configured above this is
# 'movielens/resultsmovielens_20m_full_results.csv'. That may be intentional to
# match how the runner names its output -- confirm before adding a '/'.
full_results_path = '{}{}{}'.format(results_directory, data_name, '_full_results.csv')

# Load the csv with its first column as the index.
results = pd.read_csv(full_results_path, sep=',', index_col=0)



In [16]:

    
# View part or all of the results: every row, restricted to the columns below.
results.loc[:, [
    'user_vector',
    'content_vector',
    'N',
    'alg_type',
    'serendipity',
    'cat_coverage',
    'rmse',
]]









    Out[16]:






  
    
      
      user_vector
      content_vector
      N
      alg_type
      serendipity
      cat_coverage
      rmse
    
  
  
    
      0 
               pos_ratings
       genre
       1000
              cb_vect
       0.127970
       28.304557
       0.579861
    
    
      1 
                   ratings
       genre
        100
        cb_kmeans_100
       0.229742
        2.698327
       1.300742
    
    
      2 
               pos_ratings
       genre
       1000
              cf_user
       0.047091
       28.304557
       0.474636
    
    
      3 
       ratings_to_interact
       genre
       1000
              cb_vect
       0.127515
       26.983270
       0.791903
    
    
      4 
               pos_ratings
       genre
        100
              cf_user
       0.018797
        2.830456
       0.454204
    
    
      5 
                   ratings
       genre
       1000
              cb_vect
       0.101428
       26.983270
       1.198814
    
    
      6 
                   ratings
       genre
        100
             cf_mllib
       0.075411
        2.698327
       0.757524
    
    
      7 
       ratings_to_interact
       genre
        100
        cb_kmeans_100
       0.274341
        2.698327
       0.820626
    
    
      8 
               pos_ratings
        tags
       1000
              cb_vect
       0.087713
       28.304557
       0.488214
    
    
      9 
       ratings_to_interact
       genre
       1000
              cf_user
       0.057605
       26.983270
       0.474519
    
    
      10
               pos_ratings
       genre
       1000
              cf_item
       0.350344
       28.304557
       0.471052
    
    
      11
       ratings_to_interact
       genre
       1000
             cf_mllib
       0.104021
       26.983270
       0.464547
    
    
      12
       ratings_to_interact
       genre
        100
              cb_vect
       0.218246
        2.698327
       0.684127
    
    
      13
                   ratings
       genre
       1000
              cb_vect
       0.103222
       26.983270
       1.199120
    
    
      14
       ratings_to_interact
       genre
        100
              cf_item
       0.286285
        2.698327
       0.386307
    
    
      15
       ratings_to_interact
        tags
        100
              cb_vect
       0.107325
        2.698327
       0.846901
    
    
      16
                   ratings
       genre
       1000
              cf_user
       0.032895
       26.983270
       0.885211
    
    
      17
       ratings_to_interact
       genre
       1000
              cf_item
       0.093958
       26.983270
       0.498258
    
    
      18
       ratings_to_interact
        tags
       1000
              cb_vect
       0.094904
       26.983270
       1.037724
    
    
      19
                   ratings
        tags
        100
        cb_kmeans_100
       0.168451
        2.671344
       1.489720
    
    
      20
               pos_ratings
        tags
        100
              cb_vect
       0.092884
        2.830456
       0.495794
    
    
      21
       ratings_to_interact
        tags
       1000
       cb_kmeans_1000
       0.114713
       26.713438
       1.147776
    
    
      22
               pos_ratings
       genre
        100
             cf_mllib
       0.093116
        2.830456
       0.513351
    
    
      23
                   ratings
        tags
       1000
              cb_vect
       0.071259
       26.983270
       1.356846
    
    
      24
               pos_ratings
       genre
        100
              cb_vect
       0.242384
        2.830456
       0.618196
    
    
      25
       ratings_to_interact
       genre
        100
             cf_mllib
       0.238649
        2.698327
       0.424146
    
    
      26
               pos_ratings
       genre
        100
        cb_kmeans_100
       0.247484
        2.802151
       0.576947
    
    
      27
                   ratings
        tags
       1000
       cb_kmeans_1000
       0.086146
       26.983270
       1.392828
    
    
      28
                   ratings
       genre
        100
       cb_kmeans_1000
       0.206245
        2.698327
       1.253268
    
    
      29
                   ratings
       genre
        100
              cb_vect
       0.202159
        2.698327
       1.264294
    
    
      30
                   ratings
       genre
       1000
       cb_kmeans_1000
       0.105056
       26.713438
       1.261918
    
    
      31
                   ratings
       genre
       1000
              cf_item
       0.150584
       26.983270
       1.003187
    
    
      32
                   ratings
       genre
        100
        cb_kmeans_100
       0.228856
        2.698327
       1.299784
    
    
      33
               pos_ratings
        tags
       1000
       cb_kmeans_1000
       0.106437
       28.304557
       0.481822
    
    
      34
                   ratings
       genre
        100
              cb_vect
       0.202552
        2.698327
       1.272136
    
    
      35
       ratings_to_interact
        tags
        100
        cb_kmeans_100
       0.178447
        2.671344
       1.100347
    
    
      36
       ratings_to_interact
       genre
        100
              cf_user
       0.111164
        2.698327
       0.294139
    
    
      37
                   ratings
       genre
        100
              cf_item
       0.406931
        2.698327
       1.493486
    
    
      38
                   ratings
       genre
        100
              cf_user
       0.011230
        2.698327
       0.841587
    
    
      39
                   ratings
       genre
       1000
             cf_mllib
       0.058938
       26.983270
       0.814773
    
    
      40
       ratings_to_interact
       genre
       1000
       cb_kmeans_1000
       0.140200
       26.983270
       0.931490
    
    
      41
               pos_ratings
       genre
       1000
             cf_mllib
       0.079068
       28.304557
       0.481792
    
    
      42
                   ratings
        tags
        100
              cb_vect
       0.086813
        2.698327
       1.209604
    
    
      43
               pos_ratings
       genre
        100
              cf_item
       0.357078
        2.830456
       0.539761
    
    
      44
               pos_ratings
       genre
       1000
       cb_kmeans_1000
       0.139120
       28.304557
       0.558985
    
    
      45
               pos_ratings
        tags
        100
        cb_kmeans_100
       0.201609
        2.802151
       0.483458



In [ ]:



In [ ]:

	user_vector	content_vector	N	alg_type	serendipity	cat_coverage	rmse
0	pos_ratings	genre	1000	cb_vect	0.127970	28.304557	0.579861
1	ratings	genre	100	cb_kmeans_100	0.229742	2.698327	1.300742
2	pos_ratings	genre	1000	cf_user	0.047091	28.304557	0.474636
3	ratings_to_interact	genre	1000	cb_vect	0.127515	26.983270	0.791903
4	pos_ratings	genre	100	cf_user	0.018797	2.830456	0.454204
5	ratings	genre	1000	cb_vect	0.101428	26.983270	1.198814
6	ratings	genre	100	cf_mllib	0.075411	2.698327	0.757524
7	ratings_to_interact	genre	100	cb_kmeans_100	0.274341	2.698327	0.820626
8	pos_ratings	tags	1000	cb_vect	0.087713	28.304557	0.488214
9	ratings_to_interact	genre	1000	cf_user	0.057605	26.983270	0.474519
10	pos_ratings	genre	1000	cf_item	0.350344	28.304557	0.471052
11	ratings_to_interact	genre	1000	cf_mllib	0.104021	26.983270	0.464547
12	ratings_to_interact	genre	100	cb_vect	0.218246	2.698327	0.684127
13	ratings	genre	1000	cb_vect	0.103222	26.983270	1.199120
14	ratings_to_interact	genre	100	cf_item	0.286285	2.698327	0.386307
15	ratings_to_interact	tags	100	cb_vect	0.107325	2.698327	0.846901
16	ratings	genre	1000	cf_user	0.032895	26.983270	0.885211
17	ratings_to_interact	genre	1000	cf_item	0.093958	26.983270	0.498258
18	ratings_to_interact	tags	1000	cb_vect	0.094904	26.983270	1.037724
19	ratings	tags	100	cb_kmeans_100	0.168451	2.671344	1.489720
20	pos_ratings	tags	100	cb_vect	0.092884	2.830456	0.495794
21	ratings_to_interact	tags	1000	cb_kmeans_1000	0.114713	26.713438	1.147776
22	pos_ratings	genre	100	cf_mllib	0.093116	2.830456	0.513351
23	ratings	tags	1000	cb_vect	0.071259	26.983270	1.356846
24	pos_ratings	genre	100	cb_vect	0.242384	2.830456	0.618196
25	ratings_to_interact	genre	100	cf_mllib	0.238649	2.698327	0.424146
26	pos_ratings	genre	100	cb_kmeans_100	0.247484	2.802151	0.576947
27	ratings	tags	1000	cb_kmeans_1000	0.086146	26.983270	1.392828
28	ratings	genre	100	cb_kmeans_1000	0.206245	2.698327	1.253268
29	ratings	genre	100	cb_vect	0.202159	2.698327	1.264294
30	ratings	genre	1000	cb_kmeans_1000	0.105056	26.713438	1.261918
31	ratings	genre	1000	cf_item	0.150584	26.983270	1.003187
32	ratings	genre	100	cb_kmeans_100	0.228856	2.698327	1.299784
33	pos_ratings	tags	1000	cb_kmeans_1000	0.106437	28.304557	0.481822
34	ratings	genre	100	cb_vect	0.202552	2.698327	1.272136
35	ratings_to_interact	tags	100	cb_kmeans_100	0.178447	2.671344	1.100347
36	ratings_to_interact	genre	100	cf_user	0.111164	2.698327	0.294139
37	ratings	genre	100	cf_item	0.406931	2.698327	1.493486
38	ratings	genre	100	cf_user	0.011230	2.698327	0.841587
39	ratings	genre	1000	cf_mllib	0.058938	26.983270	0.814773
40	ratings_to_interact	genre	1000	cb_kmeans_1000	0.140200	26.983270	0.931490
41	pos_ratings	genre	1000	cf_mllib	0.079068	28.304557	0.481792
42	ratings	tags	100	cb_vect	0.086813	2.698327	1.209604
43	pos_ratings	genre	100	cf_item	0.357078	2.830456	0.539761
44	pos_ratings	genre	1000	cb_kmeans_1000	0.139120	28.304557	0.558985
45	pos_ratings	tags	100	cb_kmeans_100	0.201609	2.802151	0.483458