In this little exercise, I take a look at the distribution of Yelp ratings (1 to 5 stars) and how they correlate with business and user attributes. Finally, I test several ML algorithms for predicting a rating from the business/user attributes and the review text.
In [1]:
import os, sys
import numpy as np
import scipy as sp
import pandas as pd
import random
import re
import matplotlib
import matplotlib.pyplot as plt
#matplotlib.style.use('ggplot')
matplotlib.style.use('fivethirtyeight')
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
%config InlineBackend.figure_formats=['svg']
In [2]:
def format_column_names(dataFrame):
    dataFrame.columns = dataFrame.columns.str.replace(r'[^\w.]+', '_')
    dataFrame.columns = dataFrame.columns.str.lower()

pd.options.display.max_seq_items = 500
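For illustration, here is what the helper does to a toy frame (the column names below are made up, not taken from the Yelp CSVs): headers are lowercased and runs of non-word characters become underscores.

demo = pd.DataFrame(columns=['Accepts Credit Cards', 'Price Range'])
format_column_names(demo)
list(demo.columns)   # ['accepts_credit_cards', 'price_range']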
In [3]:
# The fraction (random sample) of the review dataset that is to be parsed.
# Table joining and text vectorization are demanding in terms of memory and CPU load,
# so I recommend a value <= 0.1.
REVIEW_FRAC = 0.05
In [4]:
# Set the source directory for the input csv files (business.csv, user.csv, review.csv)
source_dir = os.path.join( os.getcwd(), 'scratch' )
required_files = ['business.csv', 'user.csv', 'review.csv']
nfiles_found = sum( os.path.isfile( os.path.join(source_dir, f) ) for f in required_files)
if nfiles_found < len(required_files):
    source_dir = input('Specify CSV source directory: ')
    nfiles_found = sum( os.path.isfile( os.path.join(source_dir, f) ) for f in required_files)
if nfiles_found < len(required_files):
    print('Source files not found.')
    sys.exit(1)
print('Source directory: {0}'.format(source_dir))
In [5]:
businesses_file = os.path.join(source_dir, 'business.csv')
# Load business data
businesses = pd.read_csv( businesses_file,
                          parse_dates=True,
                          low_memory=False,
                          index_col='business_id'
                          )
format_column_names(businesses)
In [6]:
# Drop irrelevant columns
irrel_cols = [col for col in list(businesses) if col.startswith('attributes.hair_types')]
businesses.drop(irrel_cols, axis=1, inplace=True)
# Identify columns starting with 'attributes.'
attr_cols = [col for col in list(businesses) if col.startswith('attributes.')]
# Convert attribute columns to numeric values (no/undefined/yes -> 0.0/0.5/1.0)
businesses[attr_cols] = businesses[attr_cols].replace(
    to_replace=[True, 'yes', 'full_bar', 'free', 'yes_free', 'quiet', 'yes_corkage', 'beer_and_wine'], value=1.0 )
businesses[attr_cols] = businesses[attr_cols].replace(
    to_replace=[False, 'no', 'none', 'very_loud'], value=0.0 )
businesses[attr_cols] = businesses[attr_cols].apply(pd.to_numeric, errors='coerce')
businesses[attr_cols] = businesses[attr_cols].fillna(value=0.5)
# Convert categorical data
#businesses['city'] = pd.Categorical(businesses['city']).codes
businesses['city'] = pd.factorize(businesses['city'])[0]
#businesses.columns
businesses.info()
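To illustrate the attribute conversion above on a toy Series (made-up values, not from the dataset): recognised "positive" strings map to 1.0, "negative" ones to 0.0, and anything unrecognised ends up as the neutral 0.5 after the to_numeric/fillna step.

demo_attr = pd.Series(['yes', 'no', 'full_bar', None, 'average'])
demo_attr = demo_attr.replace(to_replace=['yes', 'full_bar'], value=1.0)
demo_attr = demo_attr.replace(to_replace=['no'], value=0.0)
pd.to_numeric(demo_attr, errors='coerce').fillna(0.5).tolist()   # [1.0, 0.0, 1.0, 0.5, 0.5]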
In [7]:
# Plot the business mean ratings
#star_counts = businesses.stars.value_counts(sort=False, normalize=True).sort_index()
#star_counts.plot(kind="bar", title="Business Mean Ratings", rot='0').set_xlabel('Rating')
In [8]:
users_file = os.path.join(source_dir, 'user.csv')
# Load user data
users = pd.read_csv( users_file, parse_dates=True, index_col='user_id' )
format_column_names(users)
compl_cols = [col for col in list(users) if col.startswith('compliments.')]
users['compliments'] = users[compl_cols].sum(axis=1)
vote_cols = [col for col in list(users) if col.startswith('votes.')]
users['votes'] = users[vote_cols].sum(axis=1)
#users.columns
users.info()
In [9]:
reviews_file = os.path.join(source_dir, 'review.csv')
# count lines
#num_lines = sum(1 for _ in open(reviews_file))
num_lines = 10000000   # hard-coded to avoid the slow line count above
# configure random line indices to skip
random.seed(123)
skip_idx = random.sample(range(1, num_lines), num_lines - int(REVIEW_FRAC*num_lines))
# only load a random fraction of the reviews dataset, specified by REVIEW_FRAC
reviews = pd.read_csv( reviews_file,
                       parse_dates=True,
                       index_col='review_id',
                       skiprows=skip_idx
                       )
format_column_names(reviews)
reviews['text_length'] = reviews['text'].str.len()
reviews['text_wc'] = reviews['text'].str.split().apply(len)
vote_cols = [col for col in list(reviews) if col.startswith('votes.')]
reviews['votes'] = reviews[vote_cols].sum(axis=1)
times = pd.DatetimeIndex(reviews.date)
reviews['year'] = times.year
#reviews.columns
reviews.info()
In [10]:
%time rb = pd.merge(reviews, businesses, how='left', left_on='business_id', right_index=True, suffixes=('@reviews', '@businesses'))
%time rbu = pd.merge(rb, users, how='left', left_on='user_id', right_index=True, suffixes=('@reviews', '@users'))
#rbu['stars@reviews'].loc[rbu['votes@reviews'] >= 1].size
del businesses
del users
del reviews
#rbu.columns
rbu.info()
In [11]:
rbu['votes@reviews'].value_counts(normalize=True).iloc[:20] \
    .plot.bar(rot=90, title='Distribution of votes per review')
Out[11]:
In [12]:
star_counts = rbu['stars@reviews'].value_counts(normalize=True).sort_index()
star_counts_min1 = rbu['stars@reviews'].loc[rbu['votes@reviews'] >= 1].value_counts(normalize=True).sort_index()
star_counts_min5 = rbu['stars@reviews'].loc[rbu['votes@reviews'] >= 5].value_counts(normalize=True).sort_index()
star_counts_comb = pd.concat([star_counts, star_counts_min1, star_counts_min5], axis=1)
star_counts_comb.columns = ['all', 'minimum of 1 vote', 'minimum of 5 votes']
star_counts_comb.plot.bar(title="Distribution of ratings", stacked=False, rot=0).set_xlabel('Rating')
Out[12]:
In [13]:
MIN_VOTES = 5
rbu_min_votes = rbu.loc[rbu['votes@reviews'] >= MIN_VOTES]
star_counts_per_year = rbu_min_votes.groupby(['year'])['stars@reviews'].value_counts(normalize=True).unstack().transpose()
star_counts_per_year[star_counts_per_year.columns[-5:]].plot.bar(title="Distribution of ratings per year (min. of {0} votes)".format(MIN_VOTES), stacked=False, rot=0).set_xlabel('Rating')
Out[13]:
In [14]:
cols = ['attributes.accepts_credit_cards',
        'attributes.alcohol',
        'attributes.by_appointment_only',
        'attributes.caters',
        'attributes.coat_check',
        'attributes.corkage',
        'attributes.delivery',
        'attributes.dogs_allowed',
        'attributes.drive_thru'] \
    + [col for col in list(rbu) if col.startswith('attributes.good_for')] \
    + ['attributes.happy_hour',
       'attributes.has_tv',
       'attributes.noise_level',
       'attributes.open_24_hours',
       'attributes.order_at_counter',
       'attributes.outdoor_seating',
       'attributes.price_range',
       'attributes.smoking',
       'attributes.take_out',
       'attributes.takes_reservations',
       'attributes.waiter_service',
       'attributes.wheelchair_accessible',
       'attributes.wi_fi',
       'review_count@users',
       'compliments']
correls = rbu[cols].corrwith(rbu['stars@reviews'], drop=True).sort_values()
correls.plot.bar(figsize=(10,4), title='Correlation of business attributes (#1) with avg. rating')
Out[14]:
In [15]:
cols = [col for col in list(rbu) if re.search(r'(\.ambience|\.music|\.parking)', col)]
correls = rbu[cols].corrwith(rbu['stars@businesses'], drop=True).sort_values()
correls.plot.bar(figsize=(10,4), title='Correlation of business attributes (#2) with avg. rating')
Out[15]:
In [16]:
cols = ['compliments', 'votes@reviews', 'votes@users', 'review_count@users', 'fans']
correls = rbu[cols].corrwith(rbu['stars@reviews'], drop=True).sort_values()
correls.plot.bar(title='Correlation of user attributes & review votes with ratings')
Out[16]:
In [17]:
# rbu.loc[rbu['votes'] >= 5].groupby(['stars_review'])['text_length'].mean().plot.bar(title="Mean review length (characters) vs. rating (min. of 5 votes)", stacked=False, rot=0).set_xlabel('Rating')
In [18]:
rbu.groupby(['stars@reviews'])['text_wc'].mean().plot.bar(title="Mean review word count vs. rating", stacked=False, rot=0).set_xlabel('Rating')
Out[18]:
In [19]:
data_train, data_test = train_test_split(
    rbu,
    test_size=0.2,
    random_state=1
)
# column containing the review texts
text_col = 'text'
# column containing the label
label_col = 'stars@reviews'
# columns containing attribute features
attr_cols = [col for col in list(rbu) if re.match(r'(votes\..+@users|votes\..+@reviews|attributes\.|city)', col)]
#attr_cols
In [20]:
# train the vectorizer from training data (review texts)
vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    strip_accents='unicode',
    max_df=0.9,
    min_df=3,
    max_features=100000
)
vect.fit(data_train[text_col])
Out[20]:
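As a quick sanity check (my addition, not part of the original notebook), the size of the fitted vocabulary can be inspected; by construction it cannot exceed the max_features limit of 100000.

len(vect.get_feature_names())   # number of retained tokens/bigrams, <= max_features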
In [21]:
# collect features
#X_train_attr = data_train[attr_cols]
#X_test_attr = data_test[attr_cols]
X_train_attr = pd.get_dummies( data_train[attr_cols] )
X_test_attr = pd.get_dummies( data_test[attr_cols] )
X_train_dtm = vect.transform(data_train[text_col])
X_test_dtm = vect.transform(data_test[text_col])
# collect labels
y_train = data_train[label_col]
y_test = data_test[label_col]
In [22]:
# combine attribute matrices and sparse document-term matrices
X_train = sp.sparse.hstack((X_train_dtm, X_train_attr))
X_test = sp.sparse.hstack((X_test_dtm, X_test_attr))
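A brief sanity check I would add here (not part of the original code): the stacked matrices should have exactly one column per text token plus one per attribute column.

assert X_train.shape[1] == X_train_dtm.shape[1] + X_train_attr.shape[1]
assert X_test.shape[1] == X_test_dtm.shape[1] + X_test_attr.shape[1]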
In [23]:
# use a Multinomial Naive Bayes model on the document-term matrix (text features only)
nb = MultinomialNB(
    alpha=0.1
)
nb.fit(X_train_dtm, y_train)
Out[23]:
In [24]:
# make class predictions for X_test_dtm
y_pred = nb.predict(X_test_dtm)
In [25]:
# calculate accuracy of class predictions
metrics.accuracy_score(y_test, y_pred)
Out[25]:
In [26]:
# calculate the mean absolute error (more meaningful in our case, since ratings are ordinal
# and a near miss should count less than a far miss)
metrics.mean_absolute_error(y_test, y_pred)
Out[26]:
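To see why the mean absolute error is the more informative metric here, consider a toy comparison (illustrative numbers only, not from the dataset): two predictors that are both wrong on every review, but by very different margins.

# illustrative toy comparison, not part of the original analysis
y_true_toy = [5, 5, 5]
y_pred_near_miss = [4, 4, 4]   # off by one star
y_pred_far_miss = [1, 1, 1]    # off by four stars
metrics.accuracy_score(y_true_toy, y_pred_near_miss)       # 0.0
metrics.accuracy_score(y_true_toy, y_pred_far_miss)        # 0.0 -- accuracy cannot tell them apart
metrics.mean_absolute_error(y_true_toy, y_pred_near_miss)  # 1.0
metrics.mean_absolute_error(y_true_toy, y_pred_far_miss)   # 4.0 -- MAE reflects how far off the predictions are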
In [27]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred)
Out[27]:
In [28]:
# examine how often tokens of our dictionary appear in each rating category
tokens = pd.DataFrame(
    {'token': vect.get_feature_names(),
     '1 star' : nb.feature_count_[0, :] / nb.class_count_[0],
     '2 stars': nb.feature_count_[1, :] / nb.class_count_[1],
     '3 stars': nb.feature_count_[2, :] / nb.class_count_[2],
     '4 stars': nb.feature_count_[3, :] / nb.class_count_[3],
     '5 stars': nb.feature_count_[4, :] / nb.class_count_[4]}
).set_index('token')
tokens.sample(10, random_state=3)
Out[28]:
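As a possible follow-up (my sketch, not in the original notebook), the same table can be sorted to surface tokens that occur far more often in 5-star than in 1-star reviews; the small constant merely guards against division by zero.

# hypothetical follow-up: tokens most skewed towards 5-star reviews
(tokens['5 stars'] / (tokens['1 star'] + 1e-6)).sort_values(ascending=False).head(10)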
In [29]:
# use a Multinomial Naive Bayes model on the attribute features only
nb = MultinomialNB(
    alpha=0.1
)
nb.fit(X_train_attr, y_train)
Out[29]:
In [30]:
# make class predictions for X_test_attr
y_pred = nb.predict(X_test_attr)
In [31]:
# calculate accuracy of class predictions
metrics.accuracy_score(y_test, y_pred)
Out[31]:
In [32]:
# calculate the mean error (more meaningful in our case)
metrics.mean_absolute_error(y_test, y_pred)
Out[32]:
In [33]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred)
Out[33]:
In [34]:
# use a Multinomial Naive Bayes model on the combined features (text + attributes)
nb = MultinomialNB(
    alpha=0.1
)
nb.fit(X_train, y_train)
Out[34]:
In [35]:
# make class predictions for X_test
y_pred = nb.predict(X_test)
In [36]:
# calculate accuracy of class predictions
metrics.accuracy_score(y_test, y_pred)
Out[36]:
In [37]:
# calculate the mean error (more meaningful in our case)
metrics.mean_absolute_error(y_test, y_pred)
Out[37]:
In [38]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred)
Out[38]:
In [39]:
# use a linear model with stochastic gradient descent (SGD)
sgd = SGDClassifier(
    loss='modified_huber',
    penalty='l2',
    alpha=1e-3,
    n_iter=20,
    n_jobs=1,
    random_state=0)
sgd.fit(X_train_dtm, y_train)
Out[39]:
In [40]:
# make class predictions for X_test_dtm
y_pred = sgd.predict(X_test_dtm)
In [41]:
# calculate accuracy of class predictions
metrics.accuracy_score(y_test, y_pred)
Out[41]:
In [42]:
# calculate the mean error (more meaningful in our case)
metrics.mean_absolute_error(y_test, y_pred)
Out[42]:
In [43]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred)
Out[43]:
In [44]:
dtree = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
    min_samples_leaf=10,
    max_depth=None
)
dtree.fit(X_train, y_train)
Out[44]:
In [45]:
# make class predictions for X_test
y_pred = dtree.predict(X_test)
In [46]:
# calculate accuracy of class predictions
metrics.accuracy_score(y_test, y_pred)
Out[46]:
In [47]:
# calculate the mean error
metrics.mean_absolute_error(y_test, y_pred)
Out[47]:
In [48]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred)
Out[48]:
In [49]:
rf = RandomForestClassifier(
    random_state=0,
    n_estimators=20,
    criterion='entropy',
    n_jobs=3,
    max_depth=150
)
rf.fit(X_train, y_train)
Out[49]:
In [50]:
# make class predictions for X_test
y_pred = rf.predict(X_test)
In [51]:
# calculate accuracy of class predictions
metrics.accuracy_score(y_test, y_pred)
Out[51]:
In [52]:
# calculate the mean error
metrics.mean_absolute_error(y_test, y_pred)
Out[52]:
In [53]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred)
Out[53]:
In [ ]: