We start the second task in the capstone project with standard step of Data Science: data preparation. Particularly we are going to load JSON file as Apache Spark DataFrame (https://spark.apache.org/docs/latest/sql-programming-guide.html) and extract set of all cuisines that are presented in 'categories' attribute.
In [2]:
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import BooleanType
basePath = 'dataminingcapstone-001'
workingDir = os.path.join(os.curdir, basePath)
sqlContext = SQLContext(sc)
targetDir = os.path.join(workingDir, 'yelp_dataset_challenge_academic_dataset')
businessJSON = os.path.join(targetDir, 'yelp_academic_dataset_business.json')
businessDF = sqlContext.read.json(businessJSON)
reviewsJSON = os.path.join(targetDir, 'yelp_academic_dataset_review.json')
reviewsDF = sqlContext.read.json(reviewsJSON)
contains = udf(lambda xs, val: val in xs, BooleanType())
restaurantsDF = businessDF[contains(businessDF.categories, lit("Restaurants"))]
allSubcategories = set(restaurantsDF.select("categories").flatMap(lambda x: x.categories).distinct().collect())
allSubcategories.remove(u'Restaurants')
Now we collected all distinct values presented in categories of restaurant businesses. Let's exam what the set contains.
In [5]:
print allSubcategories
We can notice that categories contain not only cuisines, but also not related to restaurants categories, for example, there are categories 'Dry Cleaning & Laundry' or 'Art Galleries'. We will filter out these categories using a guess that real cuisines should have many businesses associated to this category.
In [6]:
countBySubcategories = restaurantsDF.select("categories").flatMap(lambda x: x.categories).countByValue()
countBySubcategories.pop(u'Restaurants', None)
Out[6]:
We can see that the hypothesis is true, for instance, the cuisine u'Mexican' is presented in 1749 business units while Kids Activities only in one. In order to determinate the threshold we're going to visualize the distribution using a bar chart.
In [7]:
%matplotlib inline
import operator
import numpy as np
import matplotlib.pyplot as plt
sortedSubcategories = sorted(countBySubcategories.items(), key=operator.itemgetter(1), reverse=True)
counts = [c for (cat, c) in sortedSubcategories]
categories = [cat for (cat, c) in sortedSubcategories]
length = len(counts)
width = 0.5
fig, ax = plt.subplots()
ind = np.arange(length)
bar = ax.bar(ind, counts, width, color='r')
ax.set_title('Count of subcategories')
ax.set_ylabel('count of businesses')
ax.set_xlabel('id of businesses')
fig.set_size_inches(18.5, 10.5)
fig.savefig('all_reviews_bars.png', dpi=200)
plt.show()
We can filter out unnecessary categories selecting sequence of categories with of 98% of all area.
In [8]:
target = 0.98
area = float(sum(counts))
cur_sum = float(counts[0])
threshhold = None
i = 0
while cur_sum / area < target:
cur_sum += float(counts[i])
i += 1
print 'select first %d categories' % i
In [9]:
cuisines = categories[:56]
print cuisines
Now everything is ready to split all restaurant reviews by found cuisines.
In [29]:
def combineReviewsByCategory(category):
businessesByCategory = businessDF[contains(businessDF.categories, lit(category))]
selectedReviewsDF = reviewsDF.join(businessesByCategory,\
businessesByCategory.business_id == reviewsDF.business_id)
return selectedReviewsDF.select("text").map(lambda x: x.text).reduce(lambda a, b: a + b)
In [30]:
reviewsByCategories = [combineReviewsByCategory(c) for c in cuisines]
In [10]:
def calcReviewsCount(category):
businessesByCategory = businessDF[contains(businessDF.categories, lit(category))]
selectedReviewsDF = reviewsDF.join(businessesByCategory,\
businessesByCategory.business_id == reviewsDF.business_id)
return selectedReviewsDF.select("text").map(lambda x: 1).reduce(lambda a, b: a + b)
reviewsCountByCategories = [calcReviewsCount(c) for c in cuisines]
Let's analyse how many reviews contains each category and display it as bar chart.
In [11]:
print reviewsCountByCategories
In [17]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
length = len(reviewsCountByCategories)
fig, ax = plt.subplots()
ind = np.arange(length)
bar = ax.bar(ind, reviewsCountByCategories, color='r')
ax.set_title('Count of reviews for each category')
ax.set_ylabel('count of reviews')
ax.set_xlabel('category name')
plt.xticks(ind + 0.5, categories, rotation='vertical')
fig.set_size_inches(18.5, 10.5)
fig.savefig('reviews_count_per_category.png', dpi=200)
plt.show()
In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
def buildTF_IDF(corpus, use_idf=True, max_df=1.0, min_df=1, tokenizer=None):
vectorizer = TfidfVectorizer(stop_words='english',
use_idf=use_idf, max_df=max_df, min_df=min_df, tokenizer=tokenizer)
return vectorizer.fit_transform(corpus)
In [83]:
tf_matrix = buildTF_IDF(reviewsByCategories, use_idf=False)
Then we're going to construct a similariy matrix using cosine similarity metric.
In [14]:
from sklearn.metrics.pairwise import cosine_similarity
def cosine_matrix(feature_matrix):
N = feature_matrix.shape[0]
res = cosine_similarity(feature_matrix[0:1], feature_matrix)
for i in range(1, N):
res = np.vstack((res, cosine_similarity(feature_matrix[i:i+1], feature_matrix)))
return res
In [84]:
tf_cosine_matrix = cosine_matrix(tf_matrix)
print tf_cosine_matrix
In [19]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import cm
def showSimilarityMatrix(similarity_matrix, title, file_name):
fig, ax = plt.subplots()
cax = ax.matshow(similarity_matrix, interpolation='nearest', cmap=cm.coolwarm)
ax.set_title(title)
ax.xaxis.tick_bottom()
ax.set_xticks(np.arange(similarity_matrix.shape[1]), minor=False)
ax.set_yticks(np.arange(similarity_matrix.shape[0]), minor=False)
ax.set_xticklabels(cuisines, rotation='vertical')
ax.set_yticklabels(cuisines)
cbar = fig.colorbar(cax)
fig.set_size_inches(23.5, 12.5)
fig.savefig(file_name, dpi=200)
plt.show()
In [255]:
showSimilarityMatrix(tf_cosine_matrix, 'Cosine matrix for no IDF', 'tf_cosine_matrix.png')
The result we've got is not so good, many categories appear to be quite similar based on the current similarity matrix. Further we're going to make steps to improve the result.
In [87]:
tf_idf_matrix = buildTF_IDF(reviewsByCategories)
tf_idf_cosine_matrix = cosine_matrix(tf_idf_matrix)
print tf_idf_cosine_matrix
In [257]:
showSimilarityMatrix(tf_idf_cosine_matrix, 'Cosine matrix TF-IDF', 'tf_idf_cosine_matrix.png')
Based on previous similarity matrices we can see that many categories has quite high similarity measure. The reason is because built-in stop words from TfidfVectorizer is quite general and work well only for general texts but we bump with many general words specific for restaurant reviews. Examples could be 'good', 'just' or 'food'. From other side we need bear in mind that reviews could have words with misspellings but the IDF part will reward such words like this is a unique and very specific word that characterizes a text.
TfidfVectorizer from sklearn has two parameters for these purposes:
In [32]:
tuned_tf_idf_matrix = buildTF_IDF(reviewsByCategories, use_idf=True, max_df=0.8, min_df=2)
tuned_tf_idf_cosine_matrix = cosine_matrix(tuned_tf_idf_matrix)
print tuned_tf_idf_cosine_matrix
In [263]:
showSimilarityMatrix(tuned_tf_idf_cosine_matrix, 'Cosine matrix for tuned TF-IDF', 'tuned_tf_idf_cosine_matrix.png')
The similarity matrix looks now significant better, we can observe long distance between quite different cuisines while categories that actually relate to a same restaurant type have still high similarity, for example 'Pizza' and 'Italian' have quite high similarity, or another example is 'Japanese' and 'Sushi bars' are extremely similar (the distance is about 0.9).
To confirm this statement, we can make the next step and perform clustering. I chose the hierarchy clustering and clustering as dendrogram since it provides the easiest to read view and emphasize what cuisines are quite similar and how close are a whole groups of cuisines.
In [24]:
import pylab
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import linkage
def visualize_hierarchy_clusterization(cosine_matrix, filename, linkage_method='average'):
linkage_matrix = linkage(cosine_matrix, linkage_method)
fig = pylab.figure(figsize=(23.5, 12.5))
dendrogram(linkage_matrix, labels=cuisines, leaf_rotation=90., leaf_font_size=8.)
fig.savefig(filename)
plt.show()
In [315]:
visualize_hierarchy_clusterization(tuned_tf_idf_cosine_matrix, 'tuned_tf_idf_dendrogram.png')
We can see the result here is quite sensible. Let's follow when different cuisines are combined together during clustering. The first categories that has been combined were Japanese and Sushi Bars and indeed they offer almost identical dishes. The next candidate to combine were Nighlife and Bars, also very similiar type of places. The same think with Art & Entertainment and Event Planning. The next sensible combination is a combination of Mediaterian and Greek. From perspective of not so good combination can be Thai and a group of (Gluten-Free, Vegetarian, Vegan).
The idea of the next improvement is to add a pre-processing step in scope of which we'll delete punctuation and perform stemming. That should reduce the dimension of feature space and increase accuracy of comparison.
We are going to use nltk library (in particularly stemming package http://www.nltk.org/api/nltk.stem.html), but before we should download and install packages. This is an one-time oeration and can be done from python console.
In [313]:
import nltk
nltk.download()
Out[313]:
ntlk provides a series of Stemmers:
I'm going to use Lancaster because it is much newer, published in 1990, and can be more aggressive than the Porter stemming algorithm. Attempt to use WordNet Lemmatizer looks also as intresting option because it uses the WordNet Database to lookup lemmas. Lemmas differ from stems in that a lemma is a canonical form of the word, while a stem may not be a real word.
The preprocessing function is presented below. It will be used as a template for different stemmers.
In [9]:
import string
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
def processReviews(category):
businessesByCategory = businessDF[contains(businessDF.categories, lit(category))]
selectedReviewsDF = reviewsDF.join(businessesByCategory,\
businessesByCategory.business_id == reviewsDF.business_id)
return (selectedReviewsDF.select("text")
.map(lambda x: x.text)
.map(lambda x: x.translate(remove_punctuation_map))
.reduce(lambda x,y : x + y))
reviewsWithoutPunctuationRDD = [processReviews(c) for c in cuisines]
In [10]:
import nltk
'''remove punctuation, lowercase, stem'''
def tokenize_with_stemmer(stemmer, text):
tokens = nltk.word_tokenize(text)
return [stemmer.stem(item) for item in tokens]
In [50]:
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
lancaster_tf_idf_matrix = buildTF_IDF(reviewsWithoutPunctuationRDD, use_idf=True, max_df=0.8, min_df=2,
tokenizer=(lambda x: tokenize_with_stemmer(lancaster_stemmer, x)))
lancaster_tf_idf_cosine_matrix = cosine_matrix(lancaster_tf_idf_matrix)
print lancaster_tf_idf_cosine_matrix
In [64]:
showSimilarityMatrix(lancaster_tf_idf_cosine_matrix, 'Cosine matrix for TF-IDF with Lancaster stemmer',
'lancaster_tf_idf_cosine_matrix.png')
In [65]:
visualize_hierarchy_clusterization(lancaster_tf_idf_cosine_matrix, 'lancaster_tf_idf_cosine_hierachy.png')
In [63]:
from nltk.stem.porter import PorterStemmer
porterStemmer = PorterStemmer()
porter_tf_idf_matrix = buildTF_IDF(reviewsWithoutPunctuationRDD, max_df=0.8, min_df=2,
tokenizer=(lambda x: tokenize_with_stemmer(porterStemmer, x)))
porter_tf_idf_cosine_matrix = cosine_matrix(porter_tf_idf_matrix)
print porter_tf_idf_cosine_matrix
In [66]:
showSimilarityMatrix(porter_tf_idf_cosine_matrix, 'Cosine matrix for TF-IDF with Porter stemmer',
'porter_tf_idf_cosine_matrix.png')
In [67]:
visualize_hierarchy_clusterization(porter_tf_idf_cosine_matrix, 'porter_tf_idf_cosine_hierachy.png')
In [72]:
import nltk
'''remove punctuation, lowercase, stem'''
def tokenize_with_lemmatizer(emmatizer, text):
tokens = nltk.word_tokenize(text)
return [emmatizer.lemmatize(item) for item in tokens]
In [73]:
from nltk.stem.wordnet import WordNetLemmatizer
wordNetLemmatizer = WordNetLemmatizer()
wordNet_tf_idf_matrix = buildTF_IDF(reviewsWithoutPunctuationRDD, max_df=0.8, min_df=2,
tokenizer=(lambda x: tokenize_with_lemmatizer(wordNetLemmatizer, x)))
wordNet_tf_idf_cosine_matrix = cosine_matrix(wordNet_tf_idf_matrix)
print wordNet_tf_idf_cosine_matrix
In [74]:
showSimilarityMatrix(wordNet_tf_idf_cosine_matrix, 'Cosine matrix for TF-IDF with Porter stemmer',
'wordNet_tf_idf_cosine_matrix.png')
In [75]:
visualize_hierarchy_clusterization(wordNet_tf_idf_cosine_matrix, 'wordNet_tf_idf_cosine_hierachy.png')
In [78]:
lancaster2_tf_idf_matrix = buildTF_IDF(reviewsWithoutPunctuationRDD, use_idf=True, min_df=2,
tokenizer=(lambda x: tokenize_with_stemmer(lancaster_stemmer, x)))
lancaster2_tf_idf_cosine_matrix = cosine_matrix(lancaster2_tf_idf_matrix)
print lancaster2_tf_idf_cosine_matrix
In [80]:
showSimilarityMatrix(lancaster2_tf_idf_cosine_matrix, 'Cosine matrix for TF-IDF with Lancaster stemmer 2',
'lancaster2_tf_idf_cosine_matrix.png')
We can see that Lancaster stemming algorithm is too agressive and it make all text very similiar and without filtering it is higher.
In [81]:
visualize_hierarchy_clusterization(lancaster2_tf_idf_cosine_matrix, 'lancaster2_tf_idf_cosine_hierachy.png')
The result of hierarchy clustering seems to be not so sensable, basically the algorithm combines almost all categories into one since they are very close to each other.
We can check a quality of built clusters. Since there are no known ground truth labels, so the evaluation must be performed using the model itself. Silhouette Index is one of the most popular ways of measuring a particular clustering's quality. It measures how similar each pattern is to the patterns in it's own cluster as compared to patterns in other clusters. Silhouette score close to one means that the datum is appropriately clustered. If the score is close to negative one, then by the same logic we see that some items would be more appropriate if it was clustered in its neighboring cluster. A score near zero means that the datum is on the border of two natural clusters.
We are going to use the implementation from scikit-learn python library for Silhouette score calculation (http://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient). For clustering we're going to use an enhanced version of K-means algorithm named Mini Batch K-Means (http://scikit-learn.org/stable/modules/clustering.html#mini-batch-k-means). Silhouette score will be depicted as a plot score/number of clusters.
In [55]:
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn import metrics
def calculate_silhouette_score(cosine_matrix, n_clusters):
model = MiniBatchKMeans(n_clusters=n_clusters, random_state=1).fit(cosine_matrix)
return metrics.silhouette_score(lancaster_tf_idf_cosine_matrix, model.labels_, metric='euclidean')
In [91]:
min_n_cluster = 2
max_n_cluster = 40
tf_silhouette_scores=[calculate_silhouette_score(tf_cosine_matrix, i) \
for i in range(min_n_cluster, max_n_cluster)]
tf_idf_silhouette_scores=[calculate_silhouette_score(tf_idf_cosine_matrix, i) \
for i in range(min_n_cluster, max_n_cluster)]
tuned_silhouette_scores=[calculate_silhouette_score(tuned_tf_idf_cosine_matrix, i) \
for i in range(min_n_cluster, max_n_cluster)]
lancaster_silhouette_scores=[calculate_silhouette_score(lancaster_tf_idf_cosine_matrix, i)\
for i in range(min_n_cluster, max_n_cluster)]
porter_silhouette_scores=[calculate_silhouette_score(porter_tf_idf_cosine_matrix, i)\
for i in range(min_n_cluster, max_n_cluster)]
wordNet_silhouette_scores=[calculate_silhouette_score(wordNet_tf_idf_cosine_matrix, i)\
for i in range(min_n_cluster, max_n_cluster)]
In [92]:
import matplotlib.pyplot as plt
import numpy as np
x = np.arange(min_n_cluster, max_n_cluster)
plt.plot(x, tf_silhouette_scores, 'r', label='TF model')
plt.plot(x, tf_idf_silhouette_scores, 'g', label='TF-IDF model')
plt.plot(x, tuned_silhouette_scores, 'y', label='tuned TF-IDF')
plt.plot(x, porter_silhouette_scores, 'b', label='tuned TF-IDF with Porter stemming algorithm')
plt.xlabel('number of clusters')
plt.ylabel('Silhouette score')
plt.title('Silhouette scores of K-Means')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.gcf().set_size_inches(23.5, 12.5)
plt.show()
From the graph above we can see that TF and TF-IDF model are significant worse than tuned TF-IDF and TF-IDF after stemming. The maximum value of Silhouette Index reaches for option TF-IDF with stemming and for 19 clusters. Let’s construct these clusters:
In [103]:
model = MiniBatchKMeans(n_clusters=18, random_state=1).fit(porter_tf_idf_cosine_matrix)
clusters = dict()
for i, label in enumerate(model.labels_):
if label not in clusters:
clusters[label] = [cuisines[i]]
else:
clusters[label].append(cuisines[i])
for i in clusters:
print "cluster {}: {}".format(i + 1, ', '.join(clusters[i]))
In [ ]: