In [0]:
%matplotlib inline

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'
sns.set()

In [0]:
df = pd.read_csv('https://storage.googleapis.com/datascience360/booking_com_hotel_reviews.csv')

In [5]:
df.head()


Out[5]:
Unnamed: 0 Hotel_Address Hotel_Name Reviewer_Nationality Negative_Review Positive_Review Reviewer_Score lat lng
0 0 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... Hotel Arena Russia I am so angry that i made this post available... Only the park outside of the hotel was beauti... 2.9 52.360576 4.915968
1 1 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... Hotel Arena Ireland No Negative No real complaints the hotel was great great ... 7.5 52.360576 4.915968
2 2 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... Hotel Arena Australia Rooms are nice but for elderly a bit difficul... Location was good and staff were ok It is cut... 7.1 52.360576 4.915968
3 3 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... Hotel Arena United Kingdom My room was dirty and I was afraid to walk ba... Great location in nice surroundings the bar a... 3.8 52.360576 4.915968
4 4 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... Hotel Arena New Zealand You When I booked with your company on line y... Amazing location and building Romantic setting 6.7 52.360576 4.915968

In [6]:
df.tail()


Out[6]:
Unnamed: 0 Hotel_Address Hotel_Name Reviewer_Nationality Negative_Review Positive_Review Reviewer_Score lat lng
515733 515733 Wurzbachgasse 21 15 Rudolfsheim F nfhaus 1150 ... Atlantis Hotel Vienna Kuwait no trolly or staff to help you take the lugga... location 7.0 48.203745 16.335677
515734 515734 Wurzbachgasse 21 15 Rudolfsheim F nfhaus 1150 ... Atlantis Hotel Vienna Estonia The hotel looks like 3 but surely not 4 Breakfast was ok and we got earlier check in 5.8 48.203745 16.335677
515735 515735 Wurzbachgasse 21 15 Rudolfsheim F nfhaus 1150 ... Atlantis Hotel Vienna Egypt The ac was useless It was a hot week in vienn... No Positive 2.5 48.203745 16.335677
515736 515736 Wurzbachgasse 21 15 Rudolfsheim F nfhaus 1150 ... Atlantis Hotel Vienna Mexico No Negative The rooms are enormous and really comfortable... 8.8 48.203745 16.335677
515737 515737 Wurzbachgasse 21 15 Rudolfsheim F nfhaus 1150 ... Atlantis Hotel Vienna Hungary I was in 3rd floor It didn t work Free Wife staff was very kind 8.3 48.203745 16.335677

In [7]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515738 entries, 0 to 515737
Data columns (total 9 columns):
Unnamed: 0              515738 non-null int64
Hotel_Address           515738 non-null object
Hotel_Name              515738 non-null object
Reviewer_Nationality    515738 non-null object
Negative_Review         515738 non-null object
Positive_Review         515738 non-null object
Reviewer_Score          515738 non-null float64
lat                     512470 non-null float64
lng                     512470 non-null float64
dtypes: float64(3), int64(1), object(5)
memory usage: 35.4+ MB

In [0]:
# Prepare the dataset, extract representative keywords from positive and negative reviews
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(analyzer = 'word', stop_words = 'english', max_features = 20,ngram_range=(2,2))
negative_features = cv.fit_transform(df['Negative_Review'])
negative_counts = negative_features.A.sum(axis=0)
negative_words = cv.get_feature_names()
negative_terms = dict(zip(negative_words,negative_counts))
top_negative_terms = sorted(negative_terms.items() ,  key=lambda x: x[1], reverse=True)
df_negative = pd.DataFrame(top_negative_terms, columns=["term", "frequency"])


positive_features = cv.fit_transform(df['Positive_Review'])
positive_counts = positive_features.A.sum(axis=0)
positive_words = cv.get_feature_names()
positive_terms = dict(zip(positive_words,positive_counts))
top_positive_terms = sorted(positive_terms.items() ,  key=lambda x: x[1], reverse=True)
df_positive = pd.DataFrame(top_positive_terms, columns=["term", "frequency"])

In [9]:
print('Top 20 terms in Negative Reviews:')
print('--------------------------------------------')
print(df_negative)

print()

print('Top 20 terms in Positive Reviews:')
print('--------------------------------------------')
print(df_positive)


Top 20 terms in Negative Reviews:
--------------------------------------------
                   term  frequency
0            room small       9943
1          room service       6292
2            small room       5562
3      air conditioning       5525
4           booking com       4469
5             didn work       4173
6            star hotel       3993
7            tea coffee       3957
8             didn like       3702
9           rooms small       3525
10   breakfast included       3050
11             room bit       2895
12           double bed       2822
13            bit small       2764
14           little bit       2762
15          room little       2752
16             mini bar       2723
17             did work       2588
18  breakfast expensive       2509
19                wi fi       2407

Top 20 terms in Positive Reviews:
--------------------------------------------
                  term  frequency
0       great location      30092
1       staff friendly      24716
2       friendly staff      24625
3     friendly helpful      21496
4        good location      19564
5        staff helpful      17165
6        helpful staff      16887
7   excellent location      11845
8       location great      10718
9        location good      10281
10      breakfast good       9139
11      good breakfast       9088
12     comfortable bed       8844
13          room clean       8527
14    walking distance       8407
15  location excellent       7775
16     bed comfortable       7508
17         staff great       7143
18    helpful friendly       7102
19         value money       6949

In [10]:
sns.set(rc={'figure.figsize':(12,10)})
ax = sns.barplot(x="term", y="frequency", data = df_negative)
loc, labels = plt.xticks()
ax.set_xticklabels(labels, rotation=90);



In [0]:
neg = {"reviews":df['Negative_Review'],"sentiment": len(df['Negative_Review'])*["negative"]}
pos = {"reviews":df['Positive_Review'],"sentiment": len(df['Positive_Review'])*["positive"]}

df_senti = pd.DataFrame(neg).append(pd.DataFrame(pos))

In [12]:
df_senti.head()


Out[12]:
reviews sentiment
0 I am so angry that i made this post available... negative
1 No Negative negative
2 Rooms are nice but for elderly a bit difficul... negative
3 My room was dirty and I was afraid to walk ba... negative
4 You When I booked with your company on line y... negative

In [13]:
df_senti.tail()


Out[13]:
reviews sentiment
515733 location positive
515734 Breakfast was ok and we got earlier check in positive
515735 No Positive positive
515736 The rooms are enormous and really comfortable... positive
515737 staff was very kind positive

In [14]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer (max_features=150, min_df=7,  stop_words=stopwords.words('english'))
processed_features = vectorizer.fit_transform(df_senti['reviews']).toarray()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

In [15]:
X = processed_features
y = df_senti['sentiment']

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(X, y)


/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Out[15]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
comment = "The food here are awful"
processed_comment = vectorizer.transform([comment])
lr.predict(processed_comment)


Out[16]:
array(['negative'], dtype=object)

In [17]:
from sklearn.metrics import accuracy_score

predictions = lr.predict(X)

accuracy_score(y,predictions)


Out[17]:
0.9046095110307947

In [0]: