In [ ]:
#This block of code will set up a Spark context and a SQL context if you are running locally.
#If you are on a cluster or have deployed Spark a different way, you don't need this.
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
try:
    sc = SparkContext()
except ValueError:
    #a SparkContext already exists, so reuse it
    sc = SparkContext._active_spark_context
sqlCtx = SQLContext(sc)
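In [ ]:
#Optional alternative: on Spark 2.0+ the usual entry point is SparkSession.
#A minimal sketch of the equivalent setup, assuming a 2.x install (skip this
#cell on 1.x, where the cell above is the right approach):
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext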
In [ ]:
#Distribute the Hermes package to the Spark workers and make it importable on the driver
sc.addPyFile('hermes/hermes.zip')
from src import hermes_run_script
import pandas as pd
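In [ ]:
#If hermes/hermes.zip has not been built yet, it can be assembled from the
#repository's src package. A sketch assuming the notebook runs from the
#repository root; the archive path and layout here are assumptions, not
#something Hermes itself dictates:
import shutil
#creates hermes/hermes.zip containing the src/ package
shutil.make_archive('hermes/hermes', 'zip', root_dir='.', base_dir='src')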
In [ ]:
movies = sqlCtx.read.json('movielens_20m_movies.json.gz')
ratings = sqlCtx.read.json('movielens_20m_ratings.json.gz')
#We found the best tag set to be the one in MovieLens 20M, and it is usable for all MovieLens datasets
tags = sqlCtx.read.json('movielens_20m_tags.json.gz')
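In [ ]:
#Optional sanity check on the loaded data using standard DataFrame methods
#(note that count() triggers a full pass over each file):
ratings.printSchema()
print("movies: %d, ratings: %d, tags: %d" % (movies.count(), ratings.count(), tags.count()))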
In [ ]:
#name of the dataset: used to select the correct vectorizer and when saving files
data_name = 'movielens_20m'
#the types of user vectors to assess
#each dataset has its own set of user vectors to choose from
user_vector_types = ['ratings', 'pos_ratings', 'ratings_to_interact']
#the types of content vectors to assess
#each dataset has its own set of content vectors to choose from
content_vector_types = ['genre', 'tags']
#the directory where intermediate files will be saved, including user vectors, content vectors, and predictions
#this can be an HDFS path
directory = 'HDFS/movielens/data'
#the directory for the CSV results files
#this should not be an HDFS path
results_directory = 'movielens/results'
#the collaborative filtering algorithms to run
cf_predictions = ['cf_mllib', 'cf_item', 'cf_user']
#the content-based algorithms to run
cb_predictions = ['cb_vect', 'cb_kmeans_100', 'cb_kmeans_1000']
#the numbers of predictions (N) to give each user when computing the results
result_runs = [100, 1000]
#any additional items that are necessary to run the content vectors
#for MovieLens this includes the user tags, which are needed for the tag content vector
support_files = {'num_tags': 300, 'tag_rdd': tags}
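In [ ]:
#Because the results directory must be a local (non-HDFS) path, it can be
#created up front so the later CSV writes do not fail. An optional convenience
#step, not part of the original workflow:
import os
if not os.path.exists(results_directory):
    os.makedirs(results_directory)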
In [ ]:
runner = hermes_run_script.hermes_run(ratings, movies, user_vector_types, content_vector_types,
                                      sqlCtx, sc, data_name, directory, results_directory,
                                      cf_predictions, cb_predictions, result_runs,
                                      num_partitions=30, **support_files)
In [ ]:
#run the vectorizers
runner.run_vectorizer()
In [ ]:
#run the collaborative filtering algorithms
runner.run_cf_predictions()
In [ ]:
#run the content-based algorithms
runner.run_cb_predictions()
In [ ]:
#get the results for the collaborative filtering predictions
runner.run_cf_results()
In [ ]:
#get the results for the content-based predictions
runner.run_cb_results()
In [ ]:
#consolidate all of the results into a single csv file
runner.consolidate_results()
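In [ ]:
#Optional: confirm the consolidated CSV exists before reading it back.
#This assumes the <results_directory>/<data_name>_full_results.csv naming
#convention used in the next cell:
import os
print(os.path.exists(results_directory + '/' + data_name + '_full_results.csv'))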
In [10]:
full_results_path = results_directory + '/' + data_name + '_full_results.csv'
results = pd.read_csv(full_results_path, delimiter=',', index_col=0)
In [16]:
#View part or all of the results
results[['user_vector','content_vector','N','alg_type','serendipity', 'cat_coverage', 'rmse']]
Out[16]:
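In [ ]:
#Example follow-up with pandas, e.g. ranking runs by RMSE (a sketch; the
#column names are the same ones selected above, and rows without an RMSE
#sort to the bottom):
results.sort_values('rmse')[['user_vector', 'content_vector', 'alg_type', 'rmse']].head(10)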