This example is an EclairJS (JavaScript) implementation of a movie recommender.
In [1]:
// Load the SparkContext class and create the context ("local[*]" master, application name "myapp").
var SparkContext = require('eclairjs/SparkContext');
var sc = new SparkContext("local[*]", "myapp");
// Classes used by the cells below: Tuple for pairs/triples, ALS and Rating for the recommender.
var Tuple = require('eclairjs/Tuple');
// NOTE(review): List is not referenced anywhere in the cells shown here.
var List = require('eclairjs/List');
var ALS = require('eclairjs/mllib/recommendation/ALS');
var Rating = require('eclairjs/mllib/recommendation/Rating');
In [2]:
// Load the small ratings file and turn every data row into a Rating.
var small_ratings_raw_data = sc.textFile('../data/mllib/ml-latest-small/ratings.csv');
// The first line of the file is the column header.
var small_ratings_raw_data_header = small_ratings_raw_data.take(1)[0];
// Values a lambda needs from this scope (the header string, the Rating class) are passed
// in the trailing array argument and arrive as the lambda's extra parameters.
var small_ratings_data = small_ratings_raw_data
    .filter(function(line, small_ratings_raw_data_header) {
        // filters out the header
        return line != small_ratings_raw_data_header;
    }, [small_ratings_raw_data_header])
    .map(function(line) {
        return line.split(",");
    })
    .map(function(tokens, Rating) {
        // tokens are userId,movieId,rating,timestamp (the timestamp is not used).
        // split() yields strings, so convert them to numbers before building the Rating.
        var userId = parseInt(tokens[0], 10);
        var movieId = parseInt(tokens[1], 10);
        var rating = parseFloat(tokens[2]);
        return new Rating(userId, movieId, rating);
    }, [Rating])
    .cache();
// Show the first three parsed ratings as the cell result.
JSON.stringify(small_ratings_data.take(3));
Out[2]:
In [3]:
// Load the small movies file and keep (movieId, title) for every data row.
var small_movies_raw_data = sc.textFile('../data/mllib/ml-latest-small/movies.csv');
// The first line of the file is the column header.
var small_movies_raw_data_header = small_movies_raw_data.take(1)[0];
var small_movies_data = small_movies_raw_data
    .filter(function(line, small_movies_raw_data_header) {
        // filters out the header
        return line != small_movies_raw_data_header;
    }, [small_movies_raw_data_header])
    .map(function(line, Tuple) {
        var fields = line.split(",");
        var title = fields[1];
        // A title that itself contains a comma (e.g. "Flintstones, The (1994)") is cut
        // apart by split(","), so re-join everything between the first and last field.
        // NOTE(review): assumes rows are movieId,title,genres with a comma-free last
        // column — confirm against the dataset README.
        if (fields.length > 3) {
            title = fields.slice(1, fields.length - 1).join(",");
        }
        // Drop the enclosing CSV quotes and un-double any embedded ones.
        if (title && title.length > 1 &&
                title.charAt(0) == '"' && title.charAt(title.length - 1) == '"') {
            title = title.substring(1, title.length - 1).replace(/""/g, '"');
        }
        return new Tuple(parseInt(fields[0], 10), title);
    }, [Tuple])
    .cache();
// Same (movieId, title) data re-emitted through mapToPair.
var small_movies_titles = small_movies_data.mapToPair(
    function(tuple2, Tuple) {
        return new Tuple(tuple2[0], tuple2[1]);
    }, [Tuple]);
// Show the first three parsed movies as the cell result.
JSON.stringify(small_movies_data.take(3));
Out[3]:
In [4]:
// Split the small ratings set 60/20/20 into training, validation and test RDDs.
var seed = 0;
var split = small_ratings_data.randomSplit([0.6, 0.2, 0.2], seed);
var training_RDD = split[0];
var validation_RDD = split[1];
var test_RDD = split[2];
// Reduce every validation rating to its (user, product) pair.
var validation_for_predict_RDD = validation_RDD.map(
    function(r, Tuple) {
        var user = r.user();
        var product = r.product();
        return new Tuple(user, product);
    },
    [Tuple]
);
JSON.stringify(validation_for_predict_RDD.take(3));
Out[4]:
In [5]:
// Reduce every test rating to its (user, product) pair.
var test_for_predict_RDD = test_RDD.map(
    function(r, Tuple) {
        var user = r.user();
        var product = r.product();
        return new Tuple(user, product);
    },
    [Tuple]
);
JSON.stringify(test_for_predict_RDD.take(3));
Out[5]:
In [6]:
// Model selection: train one ALS model per candidate rank and remember the rank whose
// predictions have the lowest RMSE on the validation set.
seed = 5;
var iterations = 10;
var regularization_parameter = 0.1;
var ranks = [4, 8, 12];
var errors = [0, 0, 0]; // RMSE per candidate rank, filled in by the loop below
var err = 0;            // next slot of errors to fill
var tolerance = 0.02;   // not referenced in this cell
var min_error = Number.POSITIVE_INFINITY;
var best_rank = -1;
var best_iteration = -1; // not referenced in this cell
var blocks = -1;
var lambda = regularization_parameter; // not referenced in this cell
ranks.forEach(function(rank) {
    var model = ALS.train(training_RDD, rank, iterations, regularization_parameter, blocks, seed);

    // Predictions keyed by (user, product).
    var predictions = model
        .predict(validation_for_predict_RDD)
        .mapToPair(function(p, Tuple) {
            var key = new Tuple(p.user(), p.product());
            return new Tuple(key, p.rating());
        }, [Tuple]);

    // Validation ratings keyed the same way, joined with the predictions.
    var rates_and_preds = validation_RDD
        .mapToPair(function(r, Tuple) {
            var key = new Tuple(r.user(), r.product());
            return new Tuple(key, r.rating());
        }, [Tuple])
        .join(predictions);

    // Squared difference between the two joined values of every entry.
    var t = rates_and_preds.mapToFloat(function(entry) {
        var pair = entry[1];
        var diff = pair[0] - pair[1];
        return Math.pow(diff, 2);
    });

    // Root of the mean squared difference.
    var error = Math.sqrt(t.mean());
    errors[err] = error;
    err += 1;
    if (error < min_error) {
        min_error = error;
        best_rank = rank;
    }
});
// Cell result: report the rank selected by the loop above.
"The best model was trained with rank " +best_rank;
Out[6]:
In [7]:
// Load the ratings file again and parse every data row into a Rating with numeric fields.
var complete_ratings_raw_data = sc.textFile("../data/mllib/ml-latest-small/ratings.csv");
// The first line of the file is the column header.
var complete_ratings_raw_data_header = complete_ratings_raw_data.take(1)[0];
var complete_ratings_data = complete_ratings_raw_data
    .filter(function(row, complete_ratings_raw_data_header) {
        // keep everything except the header row
        return row != complete_ratings_raw_data_header;
    }, [complete_ratings_raw_data_header])
    .map(function(row, Rating) {
        var cols = row.split(",");
        return new Rating(parseInt(cols[0]), parseInt(cols[1]), parseFloat(cols[2]));
    }, [Rating])
    .cache();
// Cell result: number of parsed ratings.
JSON.stringify("There are recommendations in the complete dataset: " + complete_ratings_data.count());
Out[7]:
In [8]:
// Train on 70% of the complete ratings with the selected rank and measure the RMSE on
// the remaining 30%.
var splits2 = complete_ratings_data.randomSplit([0.7, 0.3], 0);
training_RDD = splits2[0];
test_RDD = splits2[1];
var complete_model = ALS.train(training_RDD, best_rank, iterations, regularization_parameter, blocks, seed);

// (user, product) pairs of the test set.
test_for_predict_RDD = test_RDD.map(function(r, Tuple) {
    return new Tuple(r.user(), r.product());
}, [Tuple]);

// Predictions keyed by (user, product).
var predictions = complete_model
    .predict(test_for_predict_RDD)
    .mapToPair(function(p, Tuple) {
        var key = new Tuple(p.user(), p.product());
        return new Tuple(key, p.rating());
    }, [Tuple]);

// Test ratings keyed the same way, joined with the predictions.
var rates_and_preds = test_RDD
    .mapToPair(function(r, Tuple) {
        var key = new Tuple(r.user(), r.product());
        return new Tuple(key, r.rating());
    }, [Tuple])
    .join(predictions);

// Squared difference between the two joined values of every entry.
var t = rates_and_preds.mapToFloat(function(entry) {
    var pair = entry[1];
    var diff = pair[0] - pair[1];
    return Math.pow(diff, 2);
});

// Root of the mean squared difference.
var error = Math.sqrt(t.mean());
JSON.stringify("For testing data the RMSE is " + error);
Out[8]:
In [9]:
// Load the movies file and keep (movieId, title) for every data row.
var complete_movies_raw_data =
    sc.textFile("../data/mllib/ml-latest-small/movies.csv");
// The first line of the file is the column header.
var complete_movies_raw_data_header = complete_movies_raw_data.take(1)[0];
var complete_movies_data = complete_movies_raw_data
    .filter(function(line, complete_movies_raw_data_header) {
        // filters out the header
        return line != complete_movies_raw_data_header;
    }, [complete_movies_raw_data_header])
    .map(function(line, Tuple) {
        var fields = line.split(",");
        var title = fields[1];
        // A title that itself contains a comma (e.g. "Godfather, The (1972)") is cut
        // apart by split(","), so re-join everything between the first and last field.
        // NOTE(review): assumes rows are movieId,title,genres with a comma-free last
        // column — confirm against the dataset README.
        if (fields.length > 3) {
            title = fields.slice(1, fields.length - 1).join(",");
        }
        // Drop the enclosing CSV quotes and un-double any embedded ones.
        if (title && title.length > 1 &&
                title.charAt(0) == '"' && title.charAt(title.length - 1) == '"') {
            title = title.substring(1, title.length - 1).replace(/""/g, '"');
        }
        return new Tuple(parseInt(fields[0], 10), title);
    }, [Tuple])
    .cache();
// (movieId, title) pairs, used later to attach titles to recommendations.
var complete_movies_titles = complete_movies_data.mapToPair(
    function(tuple2, Tuple) {
        return new Tuple(tuple2[0], tuple2[1]);
    }, [Tuple]);
// Cell result: number of movies.
JSON.stringify("There are movies in the complete dataset " + complete_movies_titles.count());
Out[9]:
In [10]:
// Collect all rating values per movie: (movieID, ratings).
var movie_ID_with_ratings_RDD = complete_ratings_data
    .mapToPair(function(r, Tuple) {
        return new Tuple(r.product(), r.rating());
    }, [Tuple])
    .groupByKey();

// Per movie: (movieID, (number of ratings, mean rating)).
var movie_ID_with_avg_ratings_RDD = movie_ID_with_ratings_RDD.mapToPair(
    function(ID_and_ratings_tuple, Tuple) {
        var values = ID_and_ratings_tuple[1];
        var total = 0;
        var n = 0;
        while (n < values.length) {
            total += values[n];
            n++;
        }
        return new Tuple(ID_and_ratings_tuple[0], new Tuple(n, total / n));
    }, [Tuple]);

// Per movie: (movieID, number of ratings).
var movie_rating_counts_RDD = movie_ID_with_avg_ratings_RDD.mapToPair(
    function(ID_with_avg_ratings, Tuple) {
        var countAndAvg = ID_with_avg_ratings[1];
        return new Tuple(ID_with_avg_ratings[0], countAndAvg[0]);
    }, [Tuple]);

JSON.stringify("movie_ID_with_avg_ratings_RDD " + movie_ID_with_avg_ratings_RDD.take(10));
Out[10]:
In [11]:
// Ratings entered for a new user, identified by new_user_ID.
var new_user_ID = 0;
// Each table row is [movieID, rating]; every row becomes Rating(new_user_ID, movieID, rating).
var new_user_ratings = [
    [260, 9],  // Star Wars (1977)
    [1, 8],    // Toy Story (1995)
    [16, 7],   // Casino (1995)
    [25, 8],   // Leaving Las Vegas (1995)
    [32, 9],   // Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
    [335, 4],  // Flintstones, The (1994)
    [379, 3],  // Timecop (1994)
    [296, 7],  // Pulp Fiction (1994)
    [858, 10], // Godfather, The (1972)
    [50, 8]    // Usual Suspects, The (1995)
].map(function(entry) {
    return new Rating(new_user_ID, entry[0], entry[1]);
});
var new_user_ratings_RDD = sc.parallelize(new_user_ratings);
JSON.stringify("New user ratings: " + new_user_ratings_RDD.take(10));
Out[11]:
In [12]:
// Retrain on the complete ratings plus the new user's ratings.
var complete_data_with_new_ratings_RDD = complete_ratings_data.union(new_user_ratings_RDD);
var new_ratings_model = ALS.train(
    complete_data_with_new_ratings_RDD,
    best_rank,
    iterations,
    regularization_parameter,
    blocks,
    seed
);

// Movie IDs the new user has already rated.
var new_user_ratings_ids = [];
for (var i = 0; i < new_user_ratings.length; i++) {
    var ratedMovieId = new_user_ratings[i].product();
    new_user_ratings_ids.push(ratedMovieId);
}

// (new user ID, movieID) for every movie that is not on that list.
var new_user_unrated_movies_RDD = complete_movies_data
    .filter(function(tuple, new_user_ratings_ids) {
        return new_user_ratings_ids.indexOf(tuple[0]) < 0;
    }, [new_user_ratings_ids])
    .map(function(tuple, new_user_ID, Tuple) {
        return new Tuple(new_user_ID, tuple[0]);
    }, [new_user_ID, Tuple]);

// Predict the new user's rating for each of those movies with new_ratings_model.predict().
var new_user_recommendations_RDD = new_ratings_model.predict(new_user_unrated_movies_RDD);

// Pairs of the form (movieID, predicted rating).
var new_user_recommendations_rating_RDD = new_user_recommendations_RDD.mapToPair(
    function(p, Tuple) {
        return new Tuple(p.product(), p.rating());
    }, [Tuple]);

// Attach the title, then the rating count, by joining on movieID.
var aRDD = new_user_recommendations_rating_RDD.join(complete_movies_titles);
var new_user_recommendations_rating_title_and_count_RDD = aRDD.join(movie_rating_counts_RDD);
"new_user_recommendations_rating_title_and_count_RDD " + new_user_recommendations_rating_title_and_count_RDD.count();
Out[12]:
In [13]:
// Flatten every joined entry into a (title, predicted rating, rating count) triple.
var new_user_recommendations_rating_title_and_count_RDD2 =
    new_user_recommendations_rating_title_and_count_RDD.map(function(t, Tuple) {
        // An entry looks like
        // ( 27456,( (7.553736917670094,Shackleton's Antarctic Adventure (2001) ),1) )
        // i.e. (movieID, ((rating, title), count)).
        var ratingTitleAndCount = t[1];
        var ratingAndTitle = ratingTitleAndCount[0];
        return new Tuple(ratingAndTitle[1], ratingAndTitle[0], ratingTitleAndCount[1]);
    }, [Tuple]);
JSON.stringify("new_user_recommendations_rating_title_and_count_RDD2" +
    new_user_recommendations_rating_title_and_count_RDD2.take(3));
Out[13]:
In [14]:
// Keep only movies with a rating count of 25 or more; entries are
// (title, predicted rating, rating count).
var new_user_recommendations_rating_title_and_count_RDD2_filtered =
    new_user_recommendations_rating_title_and_count_RDD2.filter(function(tuple3) {
        return !(tuple3[2] < 25);
    });
/*
 list top 25
 */
// The comparator orders by element [1] descending, so takeOrdered returns the 25
// highest predicted ratings.
var top_movies = new_user_recommendations_rating_title_and_count_RDD2_filtered.takeOrdered(25,
    function(tuple3_a, tuple3_b) {
        var aRate = tuple3_a[1];
        var bRate = tuple3_b[1];
        if (aRate > bRate) {
            return -1;
        }
        return aRate == bRate ? 0 : 1;
    });
// The filter above keeps counts >= 25, so the header says "at least 25". Element [1] is
// the model's predicted rating for the new user (the per-movie average is never joined
// in), so it is labelled as such.
var str = "TOP recommended movies (with at least 25 reviews):\n\n";
for (var i = 0; i < top_movies.length; i++) {
    str += top_movies[i][0] + " predicted rating " + top_movies[i][1] + " number of ratings " + top_movies[i][2] + "\n";
}
Out[14]:
In [17]:
// Predict a rating for one (userID, movieID) pair.
var my_movie = sc.parallelizePairs([
    new Tuple(0, 500) // Quiz Show (1994)
]);
var individual_movie_rating_RDD = new_ratings_model.predict(my_movie);
"Predicted rating for movie " + individual_movie_rating_RDD.take(1);
Out[17]:
In [ ]: