In [66]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
%matplotlib inline

In [4]:
yelp = pd.read_csv("yelp.csv")

In [5]:
yelp.head()


Out[5]:
business_id date review_id stars text type user_id cool useful funny
0 9yKzy9PApeiPPOUJEtnvkg 2011-01-26 fWKvX83p0-ka4JS3dc6E5A 5 My wife took me here on my birthday for breakf... review rLtl8ZkDX5vH5nAx9C3q5Q 2 5 0
1 ZRJwVLyzEJq1VAihDhYiow 2011-07-27 IjZ33sJrzXqU-0X6U8NwyA 5 I have no idea why some people give bad review... review 0a2KyEL0d3Yb1V6aivbIuQ 0 0 0
2 6oRAC4uyJCsJl1X0WZpVSA 2012-06-14 IESLBzqUCLdSzSqm0eCSxQ 4 love the gyro plate. Rice is so good and I als... review 0hT2KtfLiobPvh6cDC8JQg 0 1 0
3 _1QQZuf4zZOyFCvXc0o6Vg 2010-05-27 G-WvGaISbqqaMHlNnByodA 5 Rosie, Dakota, and I LOVE Chaparral Dog Park!!... review uZetl9T0NcROGOyFfughhg 1 2 0
4 6ozycU1RpktNG2-1BroVtw 2012-01-05 1uJFq2r5QfJG_6ExMRCaGw 5 General Manager Scott Petello is a good egg!!!... review vYmM4KTsC8ZfQBg-j5MWkw 0 0 0

In [6]:
yelp.describe()


Out[6]:
stars cool useful funny
count 10000.000000 10000.000000 10000.000000 10000.000000
mean 3.777500 0.876800 1.409300 0.701300
std 1.214636 2.067861 2.336647 1.907942
min 1.000000 0.000000 0.000000 0.000000
25% 3.000000 0.000000 0.000000 0.000000
50% 4.000000 0.000000 1.000000 0.000000
75% 5.000000 1.000000 2.000000 1.000000
max 5.000000 77.000000 76.000000 57.000000

In [7]:
yelp.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
business_id    10000 non-null object
date           10000 non-null object
review_id      10000 non-null object
stars          10000 non-null int64
text           10000 non-null object
type           10000 non-null object
user_id        10000 non-null object
cool           10000 non-null int64
useful         10000 non-null int64
funny          10000 non-null int64
dtypes: int64(4), object(6)
memory usage: 781.3+ KB

In [8]:
yelp["text length"] = yelp["text"].apply(len)

In [9]:
yelp.head()


Out[9]:
business_id date review_id stars text type user_id cool useful funny text length
0 9yKzy9PApeiPPOUJEtnvkg 2011-01-26 fWKvX83p0-ka4JS3dc6E5A 5 My wife took me here on my birthday for breakf... review rLtl8ZkDX5vH5nAx9C3q5Q 2 5 0 889
1 ZRJwVLyzEJq1VAihDhYiow 2011-07-27 IjZ33sJrzXqU-0X6U8NwyA 5 I have no idea why some people give bad review... review 0a2KyEL0d3Yb1V6aivbIuQ 0 0 0 1345
2 6oRAC4uyJCsJl1X0WZpVSA 2012-06-14 IESLBzqUCLdSzSqm0eCSxQ 4 love the gyro plate. Rice is so good and I als... review 0hT2KtfLiobPvh6cDC8JQg 0 1 0 76
3 _1QQZuf4zZOyFCvXc0o6Vg 2010-05-27 G-WvGaISbqqaMHlNnByodA 5 Rosie, Dakota, and I LOVE Chaparral Dog Park!!... review uZetl9T0NcROGOyFfughhg 1 2 0 419
4 6ozycU1RpktNG2-1BroVtw 2012-01-05 1uJFq2r5QfJG_6ExMRCaGw 5 General Manager Scott Petello is a good egg!!!... review vYmM4KTsC8ZfQBg-j5MWkw 0 0 0 469

In [10]:
g = sns.FacetGrid(yelp,col='stars')
g.map(plt.hist,'text length')


Out[10]:
<seaborn.axisgrid.FacetGrid at 0x23aa145fd30>

In [13]:
sns.boxplot(x = "stars", y = "text length", data = yelp)


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x23aa16029b0>

In [14]:
sns.countplot("stars", data=yelp)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x23aa1736eb8>

In [35]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df = yelp.groupby("stars").mean()
df


Out[35]:
cool useful funny text length
stars
1 0.576769 1.604806 1.056075 826.515354
2 0.719525 1.563107 0.875944 842.256742
3 0.788501 1.306639 0.694730 758.498289
4 0.954623 1.395916 0.670448 712.923142
5 0.944261 1.381780 0.608631 624.999101

In [38]:
stars_corr = df.corr()
stars_corr


Out[38]:
cool useful funny text length
cool 1.000000 -0.743329 -0.944939 -0.857664
useful -0.743329 1.000000 0.894506 0.699881
funny -0.944939 0.894506 1.000000 0.843461
text length -0.857664 0.699881 0.843461 1.000000

In [39]:
sns.heatmap(stars_corr, annot=True)


Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x23aa1c120f0>

In [47]:
yelp_class = yelp.loc[yelp["stars"].isin([1, 5])]

In [53]:
yelp_class.describe()


Out[53]:
stars cool useful funny text length
count 4086.000000 4086.000000 4086.000000 4086.000000 4086.000000
mean 4.266765 0.876897 1.422663 0.690651 661.938815
std 1.547868 2.336611 2.598515 1.961751 601.621371
min 1.000000 0.000000 0.000000 0.000000 6.000000
25% 5.000000 0.000000 0.000000 0.000000 256.000000
50% 5.000000 0.000000 1.000000 0.000000 489.500000
75% 5.000000 1.000000 2.000000 1.000000 878.000000
max 5.000000 77.000000 76.000000 39.000000 4986.000000

In [54]:
X = yelp_class["text"]
y = yelp_class["stars"]

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [56]:
X = cv.fit_transform(X)

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3)

In [62]:
from sklearn.naive_bayes import MultinomialNB

In [63]:
nb = MultinomialNB()

In [64]:
nb.fit(X_train, y_train)


Out[64]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [67]:
pred = nb.predict(X_test)

In [68]:
from sklearn.metrics import confusion_matrix, classification_report

In [69]:
print(confusion_matrix(pred, y_test))
print("\n")
print(classification_report(pred, y_test))


[[144  28]
 [ 81 973]]


             precision    recall  f1-score   support

          1       0.64      0.84      0.73       172
          5       0.97      0.92      0.95      1054

avg / total       0.93      0.91      0.92      1226

Not bad!


In [70]:
from sklearn.feature_extraction.text import TfidfTransformer

In [71]:
from sklearn.pipeline import Pipeline

In [72]:
pipeline = Pipeline([
    ("cv", CountVectorizer()),
    ("Tfidf", TfidfTransformer()),
    ("mnb", MultinomialNB())
])

In [74]:
X = yelp_class["text"]
y = yelp_class["stars"]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3)

In [75]:
pipeline.fit(X_train, y_train)


Out[75]:
Pipeline(steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_a...inear_tf=False, use_idf=True)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [76]:
ppred = pipeline.predict(X_test)

In [77]:
print(confusion_matrix(ppred, y_test))
print("\n")
print(classification_report(ppred, y_test))


[[   1    0]
 [ 202 1023]]


             precision    recall  f1-score   support

          1       0.00      1.00      0.01         1
          5       1.00      0.84      0.91      1225

avg / total       1.00      0.84      0.91      1226

Slightly Worse!


In [ ]: