In [1]:
##############################################################################
#
# Workshop: How to develop a personalised machine learning-based application
#
# Notebook 2: Classification
#
##############################################################################

In [2]:
# jupyter notebook instructions:
# - Every cell can be executed separately from the rest.
# - You can execute cells in a non-sequential order (but be careful of
#   the dependencies between them).
# - Execute a cell by pressing the play button or Shift+Enter.

In [8]:
# Import necessary modules
import pandas as pd
import numpy as np
import json
from scipy.stats.stats import pearsonr

In [3]:
# Read the headerless ratings CSV into a pandas dataframe, supplying the
# three column names ourselves
books_df = pd.read_csv(
    "../data/data_books.csv",
    sep=",",
    header=None,
    names=['Reviewer', 'Book', 'Rating'],
)

print("Ingested %d rows" % books_df.shape[0])


Ingested 383852 rows

In [4]:
# Peek at the first five rows of the dataframe
books_df.head(n=5)


Out[4]:
Reviewer Book Rating
0 276726 Rites of Passage 5
1 276729 Help!: Level 1 3
2 276729 The Amsterdam Connection : Level 4 (Cambridge ... 6
3 276744 A Painted House 7
4 276747 Little Altars Everywhere 9

In [5]:
# Count the reviews per book (most reviewed first) and show the Top-20.
# Note: despite its name, top20_books holds the counts for *every* book;
# only the displayed slice is limited to 20 rows.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
top20_books = books_df.Book.value_counts()
top20_books.head(20)


Out[5]:
The Lovely Bones: A Novel                                           707
Wild Animus                                                         581
The Da Vinci Code                                                   494
The Secret Life of Bees                                             406
The Nanny Diaries: A Novel                                          393
The Red Tent (Bestselling Backlist)                                 383
Bridget Jones's Diary                                               377
A Painted House                                                     366
Life of Pi                                                          336
Harry Potter and the Chamber of Secrets (Book 2)                    326
Divine Secrets of the Ya-Ya Sisterhood: A Novel                     323
Angels &amp                                                         317
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))    315
The Summons                                                         309
Where the Heart Is (Oprah's Book Club (Paperback))                  295
The Notebook                                                        293
Girl with a Pearl Earring                                           278
Harry Potter and the Prisoner of Azkaban (Book 3)                   277
Snow Falling on Cedars                                              275
The Pilot's Wife : A Novel                                          272
Name: Book, dtype: int64

In [6]:
# Count the reviews per reviewer (most active first) and show the Top-20.
# Note: despite its name, top20_reviewers holds the counts for *every*
# reviewer; only the displayed slice is limited to 20 rows.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
top20_reviewers = books_df.Reviewer.value_counts()
top20_reviewers.head(20)


Out[6]:
11676     6943
98391     5691
189835    1899
153662    1845
23902     1180
235105    1020
76499     1012
171118     962
16795      959
248718     941
56399      838
197659     781
35859      777
185233     698
95359      606
114368     603
158295     567
101851     563
177458     524
204864     504
Name: Reviewer, dtype: int64

In [16]:
# Pick the 3 most reviewed books as our sample
book1 = "The Lovely Bones: A Novel"
book2 = "Wild Animus"
book3 = "The Da Vinci Code"

# Collect the reviewers of each selected book
book1_reviewers = books_df.loc[books_df.Book == book1, "Reviewer"]
book2_reviewers = books_df.loc[books_df.Book == book2, "Reviewer"]
book3_reviewers = books_df.loc[books_df.Book == book3, "Reviewer"]

In [23]:
def get_book_reviews(title, common_reviewers):
    """Collect one review of `title` per reviewer in `common_reviewers`.

    Reads the module-level ``books_df`` dataframe (columns ``Reviewer``,
    ``Book`` and ``Rating``).

    Parameters
    ----------
    title : str
        Exact book title to match against the ``Book`` column.
    common_reviewers : set or list-like
        Reviewers whose reviews should be kept.

    Returns
    -------
    pandas.DataFrame
        The matching rows sorted by ``Reviewer``, with at most one row per
        reviewer. When a reviewer rated the book more than once, the review
        that appears first in ``books_df`` is the one kept.
    """
    mask = books_df.Reviewer.isin(common_reviewers) & (books_df.Book == title)
    # A stable sort preserves the original row order within each reviewer,
    # so the de-duplication below deterministically keeps the earliest
    # review instead of an arbitrary one.
    reviews = books_df[mask].sort_values('Reviewer', kind='mergesort')
    return reviews.drop_duplicates(subset='Reviewer', keep='first')


def calculate_correlation(book1, book2):
    """Return the Pearson correlation between the ratings of two books.

    Only reviewers who rated both books are taken into account. Reads the
    module-level ``books_df`` dataframe and relies on ``get_book_reviews``
    to fetch one rating per common reviewer, sorted by reviewer.

    Parameters
    ----------
    book1, book2 : str
        Exact titles as they appear in the ``Book`` column.

    Returns
    -------
    float
        The Pearson correlation coefficient, or NaN when fewer than two
        reviewers rated both books (the coefficient is undefined then).
    """
    # We start by finding the common reviewers
    book1_reviewers = books_df[books_df.Book == book1].Reviewer
    book2_reviewers = books_df[books_df.Book == book2].Reviewer
    common_reviewers = set(book1_reviewers).intersection(book2_reviewers)

    # pearsonr needs at least two paired observations and raises otherwise,
    # so report "no correlation available" instead of crashing the notebook.
    if len(common_reviewers) < 2:
        return np.nan

    # Then we look for the reviews given by common reviewers
    book1_reviews = get_book_reviews(book1, common_reviewers)
    book2_reviews = get_book_reviews(book2, common_reviewers)

    # Both frames hold one row per common reviewer in the same reviewer
    # order, so the two rating columns line up position by position.
    return pearsonr(book1_reviews.Rating, book2_reviews.Rating)[0]

# Print the correlation score for every pair of the 3 books.
# Looping over the pairs keeps each label in sync with the books actually
# being compared (the third line used to print book2 twice).
for first_book, second_book in [(book1, book2), (book1, book3), (book2, book3)]:
    print("Corr of " + str(first_book) + " and " + str(second_book) + ": "
          + str(calculate_correlation(first_book, second_book)))


Corr of The Lovely Bones: A Novel and Wild Animus: -0.288043667368
Corr of The Lovely Bones: A Novel and The Da Vinci Code: 0.0171094811196
Corr of Wild Animus and Wild Animus: -0.580228328508

In [24]:
# ????????????????
# There is a negative or very low positive correlation between the 3 most
# reviewed books. Let's try books that we expect to be more similar,
# for instance the Harry Potter series.

In [25]:
# Pick the first 3 Harry Potter books as our sample
book4 = "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))"
book5 = "Harry Potter and the Chamber of Secrets (Book 2)"
book6 = "Harry Potter and the Prisoner of Azkaban (Book 3)"

# Collect the reviewers of each selected book
book4_reviewers = books_df.loc[books_df.Book == book4, "Reviewer"]
book5_reviewers = books_df.loc[books_df.Book == book5, "Reviewer"]
book6_reviewers = books_df.loc[books_df.Book == book6, "Reviewer"]

# Report the correlation score of every pair
print("Corr of %s and %s: %s" % (book4, book5, calculate_correlation(book4, book5)))
print("Corr of %s and %s: %s" % (book4, book6, calculate_correlation(book4, book6)))
print("Corr of %s and %s: %s" % (book5, book6, calculate_correlation(book5, book6)))


Corr of Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)) and Harry Potter and the Chamber of Secrets (Book 2): 0.869647366453
Corr of Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)) and Harry Potter and the Prisoner of Azkaban (Book 3): 0.347525189982
Corr of Harry Potter and the Chamber of Secrets (Book 2) and Harry Potter and the Prisoner of Azkaban (Book 3): 0.614027793363

In [ ]:
# Aha!
# The higher the correlation between two books, the more likely it is
# that the reviewers who liked one of them will also like the other.

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: