Basic Data Ingestion, Aggregations, and Linear Models



In [1]:

    
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))









    



Attaching package: 'SparkR'

The following objects are masked from 'package:stats':

    cov, filter, lag, na.omit, predict, sd, var

The following objects are masked from 'package:base':

    colnames, colnames<-, intersect, rank, rbind, sample, subset,
    summary, table, transform

Initialize SparkContext, SQLContext, and HiveContext



In [2]:

    
sc <- sparkR.init(sparkJars="/usr/share/java/mysql-connector-java.jar", 
  sparkPackages="com.databricks:spark-csv_2.10:1.4.0")
sqlContext <- sparkRSQL.init(sc)
hiveContext <- sparkRHive.init(sc)









    



Launching java with spark-submit command /root/spark-1.6.1-bin-fluxcapacitor/bin/spark-submit --jars /usr/share/java/mysql-connector-java-5.1.28.jar --packages com.databricks:spark-csv_2.10:1.4.0 sparkr-shell /tmp/RtmpScARtp/backend_port4b7c16ad5660

Read Movie Ratings CSV



In [3]:

    
movieRatingsCsvDF <- read.df(sqlContext, 
  "/root/pipeline/datasets/movielens/ml-latest/ratings.csv", 
  "com.databricks.spark.csv", header="true") 
head(movieRatingsCsvDF)









    Out[3]:





userId movieId rating timestamp

	1 1 50 4.0 1329753504
	2 1 296 4.0 1329753602
	3 1 318 4.5 1329753494
	4 1 527 4.5 1329753507
	5 1 541 3.0 1329753607
	6 1 608 4.0 1329753638

Read Movie Ratings From Hive



In [4]:

    
movieRatingsHiveDF <- sql(hiveContext, "SELECT * FROM movie_ratings")
head(results)









    



Error in head(results): object 'results' not found

Show Only Ratings == 5



In [5]:

    
head(filter(movieRatingsHiveDF, movieRatingsHiveDF$rating == 5))









    Out[5]:





userId movieId rating timestamp

	1 1 3897 5 1329753716
	2 1 4783 5 1329754027
	3 1 4979 5 1329753751
	4 1 4995 5 1329753888
	5 1 6380 5 1329753988
	6 1 7361 5 1329753448

Aggregate and Count By UserId



In [6]:

    
userIdCounts <- 
  summarize(groupBy(movieRatingsHiveDF, movieRatingsHiveDF$userId), 
  count = n(movieRatingsHiveDF$userId))
head(arrange(userIdCounts, desc(userIdCounts$count)))









    Out[6]:





userId count

	1 92302 9269
	2 142788 7515
	3 8452 6779
	4 165005 5679
	5 221498 5644
	6 216632 5601

Train Linear Regression Model



In [7]:

    
linearRegressionModel <- glm(rating ~ userId + movieId, 
  data = movieRatingsHiveDF, family = "gaussian")

Predict Using Trained Linear Regression Model



In [8]:

    
predictionsDF <- predict(linearRegressionModel, movieRatingsHiveDF)

Calculate Errors



In [9]:

    
errorsDF <- select(
    predictionsDF, predictionsDF$label, predictionsDF$prediction, 
    predictionsDF$userId, predictionsDF$movieId, 
    alias(predictionsDF$label - predictionsDF$prediction, "error"))
head(errorsDF)









    Out[9]:





label prediction userId movieId error

	1 4 3.522174 1 50 0.4778256
	2 4 3.522209 1 296 0.4777909
	3 4.5 3.522212 1 318 0.9777878
	4 4.5 3.522242 1 527 0.9777584
	5 3 3.522244 1 541 -0.5222436
	6 4 3.522253 1 608 0.4777469

	userId	movieId	rating	timestamp
1	1	50	4.0	1329753504
2	1	296	4.0	1329753602
3	1	318	4.5	1329753494
4	1	527	4.5	1329753507
5	1	541	3.0	1329753607
6	1	608	4.0	1329753638

	userId	movieId	rating	timestamp
1	1	3897	5	1329753716
2	1	4783	5	1329754027
3	1	4979	5	1329753751
4	1	4995	5	1329753888
5	1	6380	5	1329753988
6	1	7361	5	1329753448

	userId	count
1	92302	9269
2	142788	7515
3	8452	6779
4	165005	5679
5	221498	5644
6	216632	5601

	label	prediction	userId	movieId	error
1	4	3.522174	1	50	0.4778256
2	4	3.522209	1	296	0.4777909
3	4.5	3.522212	1	318	0.9777878
4	4.5	3.522242	1	527	0.9777584
5	3	3.522244	1	541	-0.5222436
6	4	3.522253	1	608	0.4777469