In [0]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Ask the inline backend for 'retina' figures (higher-resolution PNG output).
%config InlineBackend.figure_format = 'retina'
# Apply seaborn's default theme to every matplotlib figure drawn after this point.
sns.set()
In [0]:
# Load the hotel reviews CSV directly from its remote URL into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='https://storage.googleapis.com/datascience360/booking_com_hotel_reviews.csv'
)
In [5]:
# Preview the first five rows of the raw reviews table.
df.head(n=5)
Out[5]:
In [6]:
# Preview the last five rows of the raw reviews table.
df.tail(n=5)
Out[6]:
In [7]:
# Print the column names, non-null counts and dtypes of the raw reviews table.
df.info()
In [0]:
# Prepare the dataset, extract representative keywords from positive and negative reviews
from sklearn.feature_extraction.text import CountVectorizer


def _term_counts(vectorizer, reviews):
    """Fit ``vectorizer`` on ``reviews`` and total each vocabulary term.

    Parameters
    ----------
    vectorizer : CountVectorizer
        Vectorizer to fit; it is refit from scratch on every call.
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    features : sparse matrix
        Document-term count matrix returned by ``fit_transform``.
    counts : numpy.ndarray
        1-D array with the total count of each term, aligned with ``words``.
    words : list of str
        Vocabulary terms in column order.
    """
    features = vectorizer.fit_transform(reviews)
    # Sum the sparse matrix directly: the column totals are all that is needed,
    # so there is no reason to densify the whole matrix first. This also avoids
    # the `.A` shorthand, which is deprecated in recent SciPy releases.
    counts = np.asarray(features.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on old installs that predate get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        words = list(vectorizer.get_feature_names_out())
    else:
        words = vectorizer.get_feature_names()
    return features, counts, words


def _rank_terms(term_counts):
    """Return ``(term, count)`` pairs from a dict, most frequent first."""
    return sorted(term_counts.items(), key=lambda item: item[1], reverse=True)


# Word-level bigrams only, English stop words removed, vocabulary capped at 20 terms.
cv = CountVectorizer(analyzer='word', stop_words='english', max_features=20, ngram_range=(2, 2))

negative_features, negative_counts, negative_words = _term_counts(cv, df['Negative_Review'])
negative_terms = dict(zip(negative_words, negative_counts))
top_negative_terms = _rank_terms(negative_terms)
df_negative = pd.DataFrame(top_negative_terms, columns=["term", "frequency"])

# The same vectorizer object is refit here, so after this point `cv` holds the
# vocabulary learned from the positive reviews.
positive_features, positive_counts, positive_words = _term_counts(cv, df['Positive_Review'])
positive_terms = dict(zip(positive_words, positive_counts))
top_positive_terms = _rank_terms(positive_terms)
df_positive = pd.DataFrame(top_positive_terms, columns=["term", "frequency"])
In [9]:
# Emit both ranked term tables as plain text in a single call; the empty string
# produces the blank line that separates the two sections.
print(
    'Top 20 terms in Negative Reviews:',
    '--------------------------------------------',
    df_negative,
    '',
    'Top 20 terms in Positive Reviews:',
    '--------------------------------------------',
    df_positive,
    sep='\n',
)
In [10]:
# Enlarge the default figure size (this also re-applies the seaborn theme globally).
sns.set(rc={'figure.figsize':(12,10)})
# One bar per bigram from the negative reviews, bar height = total count.
ax = sns.barplot(x="term", y="frequency", data = df_negative)
ax.set_title("Most frequent bigrams in negative reviews")
# Rotate the existing tick labels in place rather than reading them back and
# re-assigning them; the trailing semicolon hides the returned (locs, labels).
plt.xticks(rotation=90);
In [0]:
# Reshape the two review columns into one long table: a "reviews" text column
# plus a "sentiment" label that records which column each text came from.
neg = {"reviews":df['Negative_Review'],"sentiment": len(df['Negative_Review'])*["negative"]}
pos = {"reviews":df['Positive_Review'],"sentiment": len(df['Positive_Review'])*["positive"]}
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# ignore_index is left at its default so each half keeps its original row
# labels, exactly as append did.
df_senti = pd.concat([pd.DataFrame(neg), pd.DataFrame(pos)])
In [12]:
# Preview the first five rows of the stacked review/sentiment table.
df_senti.head(n=5)
Out[12]:
In [13]:
# Preview the last five rows of the stacked review/sentiment table.
df_senti.tail(n=5)
Out[13]:
In [14]:
import nltk
# Fetch the NLTK stop-word corpus (needed by stopwords.words below).
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over at most 150 terms, ignoring terms that appear in fewer than
# 7 documents and dropping NLTK's English stop words.
vectorizer = TfidfVectorizer (max_features=150, min_df=7, stop_words=stopwords.words('english'))
# Keep the result sparse: calling .toarray() would allocate a dense float
# matrix with one row per review, and scikit-learn estimators such as
# LogisticRegression accept the sparse matrix directly.
processed_features = vectorizer.fit_transform(df_senti['reviews'])
In [15]:
from sklearn.linear_model import LogisticRegression

# Features are the TF-IDF matrix; targets are the sentiment labels.
X, y = processed_features, df_senti['sentiment']

# Logistic regression with library defaults, trained on every row.
lr = LogisticRegression()
lr.fit(X=X, y=y)
Out[15]:
In [16]:
# Classify one hand-written review: vectorize it with the already-fitted
# TF-IDF vocabulary, then ask the trained model for its label.
comment = "The food here are awful"
processed_comment = vectorizer.transform(raw_documents=[comment])
lr.predict(X=processed_comment)
Out[16]:
In [17]:
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# `lr` was fitted on every row of X, so scoring it on X only measures how well
# it reproduces its own training labels. Estimate accuracy on unseen reviews
# instead: hold out 20% of the rows (stratified by label, fixed seed so the
# number is reproducible), fit an unfitted copy of `lr` with the same
# hyper-parameters on the rest, and score that copy on the held-out rows.
# `lr` itself is left untouched.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
eval_model = clone(lr).fit(X_train, y_train)
predictions = eval_model.predict(X_test)
accuracy_score(y_test, predictions)
Out[17]:
In [0]: