User trustedness

Loading data


In [9]:
# Load the Musical Instruments reviews from the gzipped JSON dump.
# This cell must actually run: every later cell derives from `all_reviews`,
# so leaving it commented out breaks Restart Kernel -> Run All.
# NOTE(review): assumes a SparkSession named `spark` is already provided by
# the kernel — confirm, or create one in a setup cell.
all_reviews = (spark
    .read
    .json('../../data/raw_data/reviews_Musical_Instruments_5.json.gz'))

Extracting ranking components


In [10]:
# Working alias for the loaded reviews DataFrame; later cells read `reviews`.
reviews = all_reviews

# One row per reviewer with the number of reviews they wrote
# (the aggregate lands in a column named `count`).
reviews_per_reviewer = (reviews
    .groupBy('reviewerID')
    .count())

In [31]:
from pyspark.sql.functions import col, udf, avg
from pyspark.sql.types import DoubleType

def _helpfulness_ratio(helpful):
    """Return the smoothed share of helpful votes for a single review.

    `helpful` is the value of the `helpful` column, unpacked as the pair
    (useful, out_of). The denominator is `out_of + 1`, so a review with no
    votes yields 0.0 instead of dividing by zero.

    Returns None when the value is missing, so the downstream `avg`
    skips the row instead of the UDF raising.
    """
    if helpful is None:
        return None
    useful, out_of = helpful
    return useful / float(out_of + 1)


# Tuple-parameter lambdas (`lambda (a, b): ...`) are a SyntaxError on
# Python 3, so the pair is unpacked inside a named function instead.
helpfulness_ratio = udf(_helpfulness_ratio, returnType=DoubleType())

# Step 1: score every review individually with the helpfulness UDF.
per_review_helpfulness = reviews.select(
    'reviewerID',
    helpfulness_ratio(col('helpful')).alias('helpfulness'))

# Step 2: collapse to one row per reviewer holding their mean review score.
helpfulness = (per_review_helpfulness
    .groupBy('reviewerID')
    .agg(avg(col('helpfulness')).alias('helpfulness')))

Computing rankings & visualizing the good and bad reviews from the most trusted users


In [32]:
# Trustedness = mean helpfulness multiplied by the number of reviews written.
trustedness_score = col('helpfulness') * col('count')

# Combine the two per-reviewer tables on reviewerID and keep only the score.
reviewers_trustedness = (helpfulness
    .join(reviews_per_reviewer, 'reviewerID')
    .select('reviewerID', trustedness_score.alias('trustedness')))

In [ ]:
# Preview the ten highest-scoring reviewers. Without an explicit sort,
# `limit(10)` returns an arbitrary set of rows that can change between runs;
# sorting by score (reviewerID breaks ties) makes the preview deterministic
# and matches the "most trusted users" this section is about.
(reviewers_trustedness
    .orderBy(['trustedness', 'reviewerID'], ascending=[False, True])
    .limit(10)
    .toPandas())