In [1]:
##############################################################################
#
# Workshop: How to develop a personalised machine learning-based application
#
# Notebook 2: Classification
#
##############################################################################
In [2]:
# Jupyter notebook instructions:
# - Every cell can be executed separately from the rest.
# - You can execute cells in a non-sequential order (but be careful of
# the dependencies between them).
# - Execute a cell by pressing the play button or Shift+Enter.
In [8]:
# Import necessary modules
import pandas as pd
import numpy as np
import json
from scipy.stats.stats import pearsonr
In [3]:
# Read the headerless ratings CSV into a pandas dataframe and name its columns
books_df = pd.read_csv(
    "../data/data_books.csv",
    sep=",",
    header=None,
    names=['Reviewer', 'Book', 'Rating'],
)
# Report how many rows were loaded
print("Ingested %d rows" % books_df.shape[0])
In [4]:
# Preview the first few rows to check that the columns were parsed as expected
books_df.head()
Out[4]:
In [5]:
# Rank the books by number of reviews (most reviewed first).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
# The full ranking is kept in `top20_books`; only the first 20 are displayed.
top20_books = books_df.Book.value_counts()
top20_books.head(20)
Out[5]:
In [6]:
# Rank the reviewers by number of reviews written (most active first).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
# The full ranking is kept in `top20_reviewers`; only the first 20 are displayed.
top20_reviewers = books_df.Reviewer.value_counts()
top20_reviewers.head(20)
Out[6]:
In [16]:
# Select 3 books as our sample
book1, book2, book3 = (
    "The Lovely Bones: A Novel",
    "Wild Animus",
    "The Da Vinci Code",
)
# Retrieve the reviewers of each sample book
book1_reviewers = books_df.loc[books_df.Book == book1, "Reviewer"]
book2_reviewers = books_df.loc[books_df.Book == book2, "Reviewer"]
book3_reviewers = books_df.loc[books_df.Book == book3, "Reviewer"]
In [23]:
def get_book_reviews(title, common_reviewers):
    """Collect one review of `title` per reviewer in `common_reviewers`.

    Reads the notebook-level ``books_df`` dataframe
    (columns ``Reviewer``, ``Book``, ``Rating``).

    Parameters
    ----------
    title : str
        Exact value to match against the ``Book`` column.
    common_reviewers : collection
        Reviewer identifiers to keep (passed to ``Series.isin``).

    Returns
    -------
    pandas.DataFrame
        The matching rows sorted by ``Reviewer``, with at most one row per
        reviewer. If a reviewer rated the title more than once, the row that
        appears first in ``books_df`` is the one kept.
    """
    mask = books_df.Reviewer.isin(common_reviewers) & (books_df.Book == title)
    # De-duplicate *before* sorting: drop_duplicates keeps the first row in
    # the frame's own order, so the retained review does not depend on how
    # the (non-stable by default) sort orders rows with equal reviewers.
    reviews = books_df[mask].drop_duplicates(subset="Reviewer")
    return reviews.sort_values("Reviewer")
def calculate_correlation(book1, book2):
    """Pearson correlation between the ratings two books got from shared reviewers.

    Reads the notebook-level ``books_df`` dataframe and uses
    ``get_book_reviews`` to fetch one rating per common reviewer for each book.

    Parameters
    ----------
    book1, book2 : str
        Exact titles as they appear in the ``Book`` column.

    Returns
    -------
    float
        The Pearson correlation coefficient, or NaN when the two books share
        fewer than two reviewers (the coefficient is undefined in that case
        and ``pearsonr`` would raise).
    """
    # We start by finding the common reviewers
    book1_reviewers = books_df.loc[books_df.Book == book1, "Reviewer"]
    book2_reviewers = books_df.loc[books_df.Book == book2, "Reviewer"]
    common_reviewers = set(book1_reviewers).intersection(book2_reviewers)
    # A correlation needs at least two paired observations
    if len(common_reviewers) < 2:
        return float("nan")
    # Then we look for the reviews given by common reviewers; both frames are
    # sorted by reviewer with one row each, so the ratings line up pairwise
    book1_reviews = get_book_reviews(book1, common_reviewers)
    book2_reviews = get_book_reviews(book2, common_reviewers)
    # pearsonr returns (coefficient, p-value); only the coefficient is needed
    return pearsonr(book1_reviews.Rating, book2_reviews.Rating)[0]
# Print the correlation score between the 3 books.
# Each label is built from the same pair that is passed to
# calculate_correlation, so label and computation cannot drift apart
# (the third line used to be labelled "book2 and book2").
for first_title, second_title in ((book1, book2), (book1, book3), (book2, book3)):
    print("Corr of " + str(first_title) + " and " + str(second_title) + ": "
          + str(calculate_correlation(first_title, second_title)))
In [24]:
# Hmm, that is unexpected.
# There is negative or very low positive correlation between the 3 most
# reviewed books. We had better try books that look more similar to each
# other, for instance the Harry Potter series.
In [25]:
# Select 3 books from the same series as our sample
book4, book5, book6 = (
    "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))",
    "Harry Potter and the Chamber of Secrets (Book 2)",
    "Harry Potter and the Prisoner of Azkaban (Book 3)",
)
# Retrieve the reviewers of each book
book4_reviewers = books_df.loc[books_df.Book == book4, "Reviewer"]
book5_reviewers = books_df.loc[books_df.Book == book5, "Reviewer"]
book6_reviewers = books_df.loc[books_df.Book == book6, "Reviewer"]
# Print the correlation score for every pair of the three titles
for left_title, right_title in ((book4, book5), (book4, book6), (book5, book6)):
    print("Corr of " + str(left_title) + " and " + str(right_title) + ": "
          + str(calculate_correlation(left_title, right_title)))
In [ ]:
# Aha!
# The higher the correlation between two books, the more likely it is that
# reviewers who liked one of them will also like the other.
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: