In [ ]:
import sys
sys.path.append('/Users/fpena/UCC/Thesis/projects/yelp/source/python')
from etl import ETLUtils
from etl.reviews_dataset_analyzer import ReviewsDatasetAnalyzer
# Load reviews
# NOTE: the dataset location is a hard-coded absolute path; point it at a
# local copy of the file before running this notebook on another machine.
file_path = '/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json'
# Load the JSON file with the project's ETL helper.
reviews = ETLUtils.load_json_file(file_path)
# Wrap the loaded reviews in the analyzer object that the cells below query.
rda = ReviewsDatasetAnalyzer(reviews)
In [ ]:
# Number of reviews per user
# Summary of the reviews keyed on the 'user_id' field, as computed by the
# analyzer.
users_summary = rda.summarize_reviews_by_field('user_id')
# float() keeps the ratio fractional under Python 2 integer division.
average_reviews_per_user = float(rda.num_reviews) / rda.num_users
# Print one pre-formatted string: print('label', value) would display a
# tuple under Python 2, whereas this form reads the same on Python 2 and 3.
print('Average number of reviews per user %s' % average_reviews_per_user)
users_summary.plot(kind='line', rot=0)
In [ ]:
# Number of reviews per item
# Summary of the reviews keyed on the 'offering_id' field, as computed by
# the analyzer.
items_summary = rda.summarize_reviews_by_field('offering_id')
# float() keeps the ratio fractional under Python 2 integer division.
average_reviews_per_item = float(rda.num_reviews) / rda.num_items
# Print one pre-formatted string: print('label', value) would display a
# tuple under Python 2, whereas this form reads the same on Python 2 and 3.
print('Average number of reviews per item %s' % average_reviews_per_item)
items_summary.plot(kind='line', rot=0)
In [ ]:
# Number of items 2 users have in common
# `plt` is not imported anywhere else in this notebook, so bring it into
# scope here instead of relying on a pylab-mode session to have injected it.
import matplotlib.pyplot as plt

# Mapping returned by the analyzer; presumably
# {number of shared items: number of user pairs} -- verify against
# ReviewsDatasetAnalyzer.count_items_in_common.
common_item_counts = rda.count_items_in_common()

# Plot the points in ascending key order. Dict iteration order is arbitrary,
# which would make the line zig-zag, and dict views are not list-like on
# Python 3, so build explicit x/y lists from the sorted pairs.
sorted_counts = sorted(common_item_counts.items())
x_values = [key for key, _ in sorted_counts]
y_values = [value for _, value in sorted_counts]
plt.plot(x_values, y_values)
In [ ]:
from pylab import boxplot

# Expand the {key: count} histogram into a flat sample in which every key
# appears `count` times, which is the input shape boxplot() expects.
# .items() and range() behave the same on Python 2 and 3 (iteritems and
# xrange exist only on Python 2).
my_data = [
    key
    for key, count in common_item_counts.items()
    for _ in range(count)
]

# Mean of the expanded sample; an empty histogram yields nan instead of
# raising ZeroDivisionError.
if my_data:
    mean_common_items = float(sum(my_data)) / len(my_data)
else:
    mean_common_items = float('nan')

# Print one pre-formatted string: print('label', value) would display a
# tuple under Python 2, whereas this form reads the same on Python 2 and 3.
print('Average number of common items between two users: %s'
      % mean_common_items)
boxplot(my_data)