Recently I got a dataset of hotel accommodation reviews to practice Sentiment Analysis with Natural Language Processing (NLP). I previously knew only the basics and wanted hands-on experience with this Natural Language Understanding task. This notebook summarizes my results.
We will perform Sentiment Analysis with NLP by applying the Occam's Razor principle: prefer the simplest technique that does the job.
In [1]:
from __future__ import division
from __future__ import print_function
In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import nltk
# When running for the first time, uncomment nltk.download() to fetch the required corpora (e.g. punkt, stopwords).
# nltk.download()
import time
In [3]:
import warnings
warnings.filterwarnings("ignore")
The following are the scripts for Sentiment Analysis with NLP.
In [4]:
score_file = 'reviews_score.csv'
review_file = 'reviews.csv'
In [5]:
def read_score_review(score_file, review_file):
"""Read score and review data."""
score_df = pd.read_csv(score_file)
review_df = pd.read_csv(review_file)
return score_df, review_df
In [6]:
def groupby_agg_data(df, gkey='gkey', rid='rid'):
    """Group-by aggregate data, including the NaN-key group."""
    agg_df = (df.groupby(gkey)[rid]
              .count()
              .reset_index())
    # groupby() drops NaN keys, so count them separately and append.
    nan_count = df[gkey].isnull().sum()
    nan_df = pd.DataFrame({gkey: [np.nan], rid: [nan_count]})
    agg_df = agg_df.append(nan_df)[[gkey, rid]]  # use pd.concat() in newer pandas
    agg_df['percent'] = agg_df[rid] / agg_df[rid].sum()
    return agg_df
In [7]:
def count_missing_data(df, cols='cols'):
"""Count missing records w.r.t. columns."""
print('Missing rows:')
for col in cols:
nan_rows = df[col].isnull().sum()
print('For {0}: {1}'.format(col, nan_rows))
In [8]:
def slice_abnormal_id(df, rid='hotel_review_id'):
"""View abnormal records with column"""
abnorm_bool_arr = (df[rid] == 0)
abnorm_count = abnorm_bool_arr.sum()
print('abnorm_count: {}'.format(abnorm_count))
abnorm_df = df[abnorm_bool_arr]
return abnorm_df
In [9]:
def remove_missing_abnormal_data(score_raw_df, review_raw_df,
rid='hotel_review_id',
score_col='rating_overall'):
"""Remove missing / abnormal data."""
filter_score_bool_arr = (score_raw_df[rid].notnull() &
score_raw_df[score_col].notnull())
score_df = score_raw_df[filter_score_bool_arr]
filter_review_bool_arr = review_raw_df[rid].notnull()
review_df = review_raw_df[filter_review_bool_arr]
return score_df, review_df
In [10]:
def join_score_review(score_df, review_df, on='hotel_review_id', how='left'):
"""Join score and review datasets."""
score_review_df = pd.merge(score_df, review_df, on=on, how=how)
score_review_count = score_review_df.shape[0]
print('score_review_count: {}'.format(score_review_count))
return score_review_df
In [11]:
def concat_review_title_comments(score_review_df,
                                 concat_cols=['review_title', 'review_comments'],
                                 concat_2col='review_title_comments'):
    """Concatenate review title and review comments, separated by '. '."""
    concat_text_col = ''
    for concat_col in concat_cols:
        concat_text_col += score_review_df[concat_col]
        if concat_col != concat_cols[-1]:
            concat_text_col += '. '
    score_review_df[concat_2col] = concat_text_col
    return score_review_df
In [12]:
def lower_review_title_comments(score_review_df,
lower_col='review_title_comments'):
"""Lower sentences."""
score_review_df[lower_col] = score_review_df[lower_col].str.lower()
return score_review_df
In [13]:
def _tokenize_sen(sen):
"""Tokenize one sentence."""
from nltk.tokenize import word_tokenize
sen_token = word_tokenize(str(sen))
return sen_token
In [14]:
def _remove_stopwords_puncs(sen):
    """Remove stopwords and meaningless punctuation in one sentence."""
    from nltk.corpus import stopwords
    # Build the stopword set once; membership tests on a set are much faster.
    stop_set = set(stopwords.words('english'))
    sen_clean = [
        word for word in sen
        if word not in stop_set and
        word not in [',', '.', '(', ')', '&']]
    return sen_clean
In [15]:
def tokenize_clean_sentence(sen):
"""Tokenize and clean one sentence."""
sen_token = _tokenize_sen(sen)
    sen_token_clean = _remove_stopwords_puncs(sen_token)
return sen_token_clean
In [16]:
# def preprocess_sentence(df, sen_cols=['review_title', 'review_comments']):
# """Preprocess sentences (deprecated due to slow performance)."""
# for sen_col in sen_cols:
# print('Start tokenizing "{}"'.format(sen_col))
# sen_token_col = '{}_token'.format(sen_col)
# df[sen_token_col] = df[sen_col].apply(tokenize_clean_sentence)
# print('Finish tokenizing "{}"'.format(sen_col))
# return df
def preprocess_sentence_par(df, sen_col='review_title_comments',
                            sen_token_col='review_title_comments_token', num_proc=32):
    """Preprocess sentences in parallel.

    Note: We apply multiprocessing with 32 processes; adjust `num_proc`
    to your computing environment.
    """
    import multiprocessing as mp
    pool = mp.Pool(num_proc)
    df[sen_token_col] = pool.map_async(tokenize_clean_sentence, df[sen_col]).get()
    pool.close()
    pool.join()
    return df
In [17]:
def get_bag_of_words(w_ls):
    """Get bag of words (binary weights) from a word list."""
    w_bow = {w: True for w in w_ls}
    return w_bow
In [18]:
def get_bag_of_words_par(df, sen_token_col='review_title_comments_token',
                         bow_col='review_title_comments_bow', num_proc=32):
    """Get bag of words in parallel for sentences."""
    import multiprocessing as mp
    pool = mp.Pool(num_proc)
    df[bow_col] = pool.map_async(get_bag_of_words, df[sen_token_col]).get()
    pool.close()
    pool.join()
    return df
In [19]:
def label_review(df, scores_ls=None, label='negative',
score_col='rating_overall',
review_col='review_title_comments_bow'):
"""Label review by positive or negative."""
df_label = df[df[score_col].isin(scores_ls)]
label_review_ls = (df_label[review_col]
.apply(lambda bow: (bow, label))
.tolist())
return label_review_ls
In [20]:
def permutate(data_ls):
"""Randomly permutate data."""
np.random.shuffle(data_ls)
In [21]:
def create_train_test_sets(pos_review_ls, neg_review_ls, train_percent=0.75):
"""Create the training and test sets."""
    # The built-in int works here; np.int is deprecated in newer NumPy.
    neg_num = int(np.ceil(len(neg_review_ls) * train_percent))
    pos_num = int(np.ceil(len(pos_review_ls) * train_percent))
train_set = neg_review_ls[:neg_num] + pos_review_ls[:pos_num]
permutate(train_set)
test_set = neg_review_ls[neg_num:] + pos_review_ls[pos_num:]
permutate(test_set)
return train_set, test_set
In [22]:
def train_naive_bayes(train_set):
    """Train a Naive Bayes classifier on the labeled bag-of-words features."""
    from nltk.classify import NaiveBayesClassifier
    nb_clf = NaiveBayesClassifier.train(train_set)
    return nb_clf
In [23]:
def eval_naive_bayes(test_set, nb_clf):
    """Evaluate the classifier by per-class precision and recall."""
    from nltk.metrics.scores import precision
    from nltk.metrics.scores import recall
ref_sets = {'positive': set(),
'negative': set()}
pred_sets = {'positive': set(),
'negative': set()}
for i, (bow, label) in enumerate(test_set):
ref_sets[label].add(i)
pred_label = nb_clf.classify(bow)
pred_sets[pred_label].add(i)
print('Positive precision:', precision(ref_sets['positive'], pred_sets['positive']))
print('Positive recall:', recall(ref_sets['positive'], pred_sets['positive']))
print('Negative precision:', precision(ref_sets['negative'], pred_sets['negative']))
print('Negative recall:', recall(ref_sets['negative'], pred_sets['negative']))
In [25]:
def pred_labels(df, clf,
bow_col='review_title_comments_bow',
pred_col='pred_label',
sel_cols=['rating_overall',
'review_title_comments_bow',
'pred_label']):
"""Predict labels for bag of words."""
df[pred_col] = df[bow_col].apply(clf.classify)
df_pred = df[sel_cols]
return df_pred
In [26]:
def get_boxplot_data(pred_label_df,
                     pred_col='pred_label', score_col='rating_overall'):
    """Collect rating_overall values for each predicted sentiment class."""
pos_data = pred_label_df[pred_label_df[pred_col] == 'positive'][score_col].values
neg_data = pred_label_df[pred_label_df[pred_col] == 'negative'][score_col].values
box_data = [pos_data, neg_data]
return box_data
In [27]:
def plot_box(d_ls, title='Box Plot', xlab='xlab', ylab='ylab',
             xticks=None, xlim=None, ylim=None, figsize=(15, 10)):
    """Draw a boxplot for each array in d_ls."""
    import matplotlib.pyplot as plt
    import seaborn as sns
    import matplotlib
    matplotlib.style.use('ggplot')
    %matplotlib inline
    # plt.subplots() already creates a figure; no separate plt.figure() needed.
    fig, ax = plt.subplots(figsize=figsize)
plt.boxplot(d_ls)
plt.title(title)
plt.xlabel(xlab)
plt.ylabel(ylab)
if xticks:
ax.set_xticklabels(xticks)
if xlim:
plt.xlim(xlim)
if ylim:
plt.ylim(ylim)
plt.show()
We first read score and review raw datasets.
The score dataset contains:
- hotel_review_id: hotel review sequence ID
- rating_overall: overall accommodation rating

The review dataset contains:
- hotel_review_id: hotel review sequence ID
- review_title: review title
- review_comments: detailed review comments
In [16]:
score_raw_df, review_raw_df = read_score_review(score_file, review_file)
print(len(score_raw_df))
print(len(review_raw_df))
In [17]:
score_raw_df.head(5)
Out[17]:
In [18]:
review_raw_df.head(5)
Out[18]:
In [19]:
count_missing_data(score_raw_df,
cols=['hotel_review_id', 'rating_overall'])
In [20]:
score_raw_df[score_raw_df.rating_overall.isnull()]
Out[20]:
In [21]:
count_missing_data(review_raw_df,
cols=['hotel_review_id', 'review_title', 'review_comments'])
In [22]:
abnorm_df = slice_abnormal_id(score_raw_df, rid='hotel_review_id')
abnorm_df
Out[22]:
In [23]:
abnorm_df = slice_abnormal_id(review_raw_df, rid='hotel_review_id')
abnorm_df
Out[23]:
In [24]:
score_raw_df.rating_overall.unique()
Out[24]:
In [25]:
score_agg_df = groupby_agg_data(
score_raw_df, gkey='rating_overall', rid='hotel_review_id')
score_agg_df
Out[25]:
In [155]:
score_df, review_df = remove_missing_abnormal_data(
score_raw_df, review_raw_df,
rid='hotel_review_id',
score_col='rating_overall')
In [156]:
score_df.head(5)
Out[156]:
In [157]:
review_df.head(5)
Out[157]:
In [158]:
score_review_df_ = join_score_review(score_df, review_df)
score_review_df_.head(5)
Out[158]:
In [159]:
score_review_df = concat_review_title_comments(
score_review_df_,
concat_cols=['review_title', 'review_comments'],
concat_2col='review_title_comments')
In [160]:
score_review_df.head(5)
Out[160]:
In [161]:
score_review_df = lower_review_title_comments(
score_review_df,
lower_col='review_title_comments')
In [162]:
score_review_df.head(5)
Out[162]:
Tokenizing is an important technique that splits a sentence into a vector of individual words. However, natural language text contains many stopwords that are useless for our task, for example: he, is, at, which, and on. We therefore remove them from the vector of tokenized words.
Note that since the tokenizing and stopword-removal steps are time-consuming, we apply Python's built-in multiprocessing package for parallel computing to improve performance.
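To illustrate, here is what tokenize_clean_sentence does to a single made-up review sentence (the exact output assumes the NLTK punkt and stopwords corpora are downloaded):

sample_sen = 'the staff was friendly, and the room was clean.'
print(tokenize_clean_sentence(sample_sen))
# Roughly: ['staff', 'friendly', 'room', 'clean']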
In [163]:
start_token_time = time.time()
score_review_token_df = preprocess_sentence_par(
score_review_df,
sen_col='review_title_comments',
sen_token_col='review_title_comments_token', num_proc=32)
end_token_time = time.time()
print('Time for tokenizing: {}'.format(end_token_time - start_token_time))
In [164]:
score_review_token_df.head(5)
Out[164]:
In [165]:
score_review_token_df.review_title_comments_token[1]
Out[165]:
The tokenized words may contain duplicates, so for simplicity we apply the Bag of Words model, which represents a sentence as a bag (multiset) of its words, ignoring grammar and even word order. Here, following the Occam's Razor principle again, we do not keep word frequencies; we use binary (presence/absence, i.e. True/False) weights.
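For example, get_bag_of_words collapses duplicates into binary features (the token list is made up):

get_bag_of_words(['staff', 'friendly', 'staff', 'clean'])
# -> {'staff': True, 'friendly': True, 'clean': True} (key order may vary)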
In [166]:
start_bow_time = time.time()
score_review_bow_df = get_bag_of_words_par(
score_review_token_df,
sen_token_col='review_title_comments_token',
bow_col='review_title_comments_bow', num_proc=32)
end_bow_time= time.time()
print('Time for bag of words: {}'.format(end_bow_time - start_bow_time))
In [167]:
score_review_bow_df.review_title_comments_bow[:5]
Out[167]:
In [168]:
neg_review_ls = label_review(
score_review_bow_df,
scores_ls=[2, 3, 4], label='negative',
score_col='rating_overall',
review_col='review_title_comments_bow')
In [169]:
pos_review_ls = label_review(
score_review_bow_df,
scores_ls=[9, 10], label='positive',
score_col='rating_overall',
review_col='review_title_comments_bow')
In [170]:
neg_review_ls[1]
Out[170]:
In [171]:
pos_review_ls[1]
Out[171]:
In [190]:
train_set, test_set = create_train_test_sets(
pos_review_ls, neg_review_ls, train_percent=0.75)
In [195]:
train_set[10]
Out[195]:
In [230]:
nb_clf = train_naive_bayes(train_set)
In [231]:
eval_naive_bayes(test_set, nb_clf)
In [248]:
start_pred_time = time.time()
pred_label_df = pred_labels(
score_review_bow_df, nb_clf,
bow_col='review_title_comments_bow',
pred_col='pred_label')
end_pred_time = time.time()
print('Time for prediction: {}'.format(end_pred_time - start_pred_time))
In [249]:
pred_label_df.head(5)
Out[249]:
From the following boxplot, we can observe that our model performs reasonably well on real-world data, even with our surprisingly simple machine learning modeling.
We can further apply divergence measures, such as the Kullback-Leibler divergence, to quantify the distance between the rating_overall distributions of the two predicted label groups, if needed.
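A minimal sketch of that idea, assuming scipy is available (the bin edges and the smoothing constant eps are illustrative choices, not part of the original analysis):

from scipy.stats import entropy
pos_scores = pred_label_df[pred_label_df.pred_label == 'positive'].rating_overall
neg_scores = pred_label_df[pred_label_df.pred_label == 'negative'].rating_overall
bins = np.arange(1, 12)  # unit-width bins covering the rating range
p, _ = np.histogram(pos_scores, bins=bins, density=True)
q, _ = np.histogram(neg_scores, bins=bins, density=True)
eps = 1e-9  # smooth zero bins; KL divergence is undefined when q has zeros
print('KL(pos || neg):', entropy(p + eps, q + eps))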
In [264]:
box_data = get_boxplot_data(
pred_label_df,
pred_col='pred_label', score_col='rating_overall')
In [267]:
plot_box(box_data, title='Box Plot for rating_overall by Sentiment Classes',
xlab='class', ylab='rating_overall',
xticks=['positive', 'negative'], figsize=(12, 7))
The boxplot also shows that reviews predicted as positive concentrate at high rating_overall. Nevertheless, the model performs comparatively poorly on negative reviews, since some reviews predicted negative have an above-average rating_overall. The reason is that the rating_overall distribution is imbalanced, which leaves far fewer negative reviews to learn from.
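The class imbalance is easy to verify directly (the exact counts depend on the dataset):

print('negative reviews: {}'.format(len(neg_review_ls)))
print('positive reviews: {}'.format(len(pos_review_ls)))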