notebook.community

Edit and run



In [66]:

    
import pandas as pd
import numpy as np



In [2]:

    
import matplotlib.pyplot as plt
import seaborn as sns



In [3]:

    
%matplotlib inline



In [4]:

    
yelp = pd.read_csv("yelp.csv")



In [5]:

    
yelp.head()









    Out[5]:







  
    
      
      business_id
      date
      review_id
      stars
      text
      type
      user_id
      cool
      useful
      funny
    
  
  
    
      0
      9yKzy9PApeiPPOUJEtnvkg
      2011-01-26
      fWKvX83p0-ka4JS3dc6E5A
      5
      My wife took me here on my birthday for breakf...
      review
      rLtl8ZkDX5vH5nAx9C3q5Q
      2
      5
      0
    
    
      1
      ZRJwVLyzEJq1VAihDhYiow
      2011-07-27
      IjZ33sJrzXqU-0X6U8NwyA
      5
      I have no idea why some people give bad review...
      review
      0a2KyEL0d3Yb1V6aivbIuQ
      0
      0
      0
    
    
      2
      6oRAC4uyJCsJl1X0WZpVSA
      2012-06-14
      IESLBzqUCLdSzSqm0eCSxQ
      4
      love the gyro plate. Rice is so good and I als...
      review
      0hT2KtfLiobPvh6cDC8JQg
      0
      1
      0
    
    
      3
      _1QQZuf4zZOyFCvXc0o6Vg
      2010-05-27
      G-WvGaISbqqaMHlNnByodA
      5
      Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
      review
      uZetl9T0NcROGOyFfughhg
      1
      2
      0
    
    
      4
      6ozycU1RpktNG2-1BroVtw
      2012-01-05
      1uJFq2r5QfJG_6ExMRCaGw
      5
      General Manager Scott Petello is a good egg!!!...
      review
      vYmM4KTsC8ZfQBg-j5MWkw
      0
      0
      0



In [6]:

    
yelp.describe()









    Out[6]:







  
    
      
      stars
      cool
      useful
      funny
    
  
  
    
      count
      10000.000000
      10000.000000
      10000.000000
      10000.000000
    
    
      mean
      3.777500
      0.876800
      1.409300
      0.701300
    
    
      std
      1.214636
      2.067861
      2.336647
      1.907942
    
    
      min
      1.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      3.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      4.000000
      0.000000
      1.000000
      0.000000
    
    
      75%
      5.000000
      1.000000
      2.000000
      1.000000
    
    
      max
      5.000000
      77.000000
      76.000000
      57.000000



In [7]:

    
yelp.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
business_id    10000 non-null object
date           10000 non-null object
review_id      10000 non-null object
stars          10000 non-null int64
text           10000 non-null object
type           10000 non-null object
user_id        10000 non-null object
cool           10000 non-null int64
useful         10000 non-null int64
funny          10000 non-null int64
dtypes: int64(4), object(6)
memory usage: 781.3+ KB



In [8]:

    
yelp["text length"] = yelp["text"].apply(len)



In [9]:

    
yelp.head()









    Out[9]:







  
    
      
      business_id
      date
      review_id
      stars
      text
      type
      user_id
      cool
      useful
      funny
      text length
    
  
  
    
      0
      9yKzy9PApeiPPOUJEtnvkg
      2011-01-26
      fWKvX83p0-ka4JS3dc6E5A
      5
      My wife took me here on my birthday for breakf...
      review
      rLtl8ZkDX5vH5nAx9C3q5Q
      2
      5
      0
      889
    
    
      1
      ZRJwVLyzEJq1VAihDhYiow
      2011-07-27
      IjZ33sJrzXqU-0X6U8NwyA
      5
      I have no idea why some people give bad review...
      review
      0a2KyEL0d3Yb1V6aivbIuQ
      0
      0
      0
      1345
    
    
      2
      6oRAC4uyJCsJl1X0WZpVSA
      2012-06-14
      IESLBzqUCLdSzSqm0eCSxQ
      4
      love the gyro plate. Rice is so good and I als...
      review
      0hT2KtfLiobPvh6cDC8JQg
      0
      1
      0
      76
    
    
      3
      _1QQZuf4zZOyFCvXc0o6Vg
      2010-05-27
      G-WvGaISbqqaMHlNnByodA
      5
      Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
      review
      uZetl9T0NcROGOyFfughhg
      1
      2
      0
      419
    
    
      4
      6ozycU1RpktNG2-1BroVtw
      2012-01-05
      1uJFq2r5QfJG_6ExMRCaGw
      5
      General Manager Scott Petello is a good egg!!!...
      review
      vYmM4KTsC8ZfQBg-j5MWkw
      0
      0
      0
      469



In [10]:

    
g = sns.FacetGrid(yelp,col='stars')
g.map(plt.hist,'text length')









    Out[10]:





<seaborn.axisgrid.FacetGrid at 0x23aa145fd30>



In [13]:

    
sns.boxplot(x = "stars", y = "text length", data = yelp)









    Out[13]:





<matplotlib.axes._subplots.AxesSubplot at 0x23aa16029b0>



In [14]:

    
sns.countplot("stars", data=yelp)









    Out[14]:





<matplotlib.axes._subplots.AxesSubplot at 0x23aa1736eb8>



In [35]:

    
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df = yelp.groupby("stars").mean()
df









    Out[35]:







  
    
      
      cool
      useful
      funny
      text length
    
    
      stars
      
      
      
      
    
  
  
    
      1
      0.576769
      1.604806
      1.056075
      826.515354
    
    
      2
      0.719525
      1.563107
      0.875944
      842.256742
    
    
      3
      0.788501
      1.306639
      0.694730
      758.498289
    
    
      4
      0.954623
      1.395916
      0.670448
      712.923142
    
    
      5
      0.944261
      1.381780
      0.608631
      624.999101



In [38]:

    
stars_corr = df.corr()
stars_corr









    Out[38]:







  
    
      
      cool
      useful
      funny
      text length
    
  
  
    
      cool
      1.000000
      -0.743329
      -0.944939
      -0.857664
    
    
      useful
      -0.743329
      1.000000
      0.894506
      0.699881
    
    
      funny
      -0.944939
      0.894506
      1.000000
      0.843461
    
    
      text length
      -0.857664
      0.699881
      0.843461
      1.000000



In [39]:

    
sns.heatmap(stars_corr, annot=True)









    Out[39]:





<matplotlib.axes._subplots.AxesSubplot at 0x23aa1c120f0>



In [47]:

    
yelp_class = yelp.loc[yelp["stars"].isin([1, 5])]



In [53]:

    
yelp_class.describe()









    Out[53]:







  
    
      
      stars
      cool
      useful
      funny
      text length
    
  
  
    
      count
      4086.000000
      4086.000000
      4086.000000
      4086.000000
      4086.000000
    
    
      mean
      4.266765
      0.876897
      1.422663
      0.690651
      661.938815
    
    
      std
      1.547868
      2.336611
      2.598515
      1.961751
      601.621371
    
    
      min
      1.000000
      0.000000
      0.000000
      0.000000
      6.000000
    
    
      25%
      5.000000
      0.000000
      0.000000
      0.000000
      256.000000
    
    
      50%
      5.000000
      0.000000
      1.000000
      0.000000
      489.500000
    
    
      75%
      5.000000
      1.000000
      2.000000
      1.000000
      878.000000
    
    
      max
      5.000000
      77.000000
      76.000000
      39.000000
      4986.000000



In [54]:

    
X = yelp_class["text"]
y = yelp_class["stars"]



In [55]:

    
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()



In [56]:

    
X = cv.fit_transform(X)



In [57]:

    
from sklearn.model_selection import train_test_split



In [58]:

    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3)



In [62]:

    
from sklearn.naive_bayes import MultinomialNB



In [63]:

    
nb = MultinomialNB()



In [64]:

    
nb.fit(X_train, y_train)









    Out[64]:





MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



In [67]:

    
pred = nb.predict(X_test)



In [68]:

    
from sklearn.metrics import confusion_matrix, classification_report



In [69]:

    
print(confusion_matrix(pred, y_test))
print("\n")
print(classification_report(pred, y_test))









    



[[144  28]
 [ 81 973]]


             precision    recall  f1-score   support

          1       0.64      0.84      0.73       172
          5       0.97      0.92      0.95      1054

avg / total       0.93      0.91      0.92      1226

Not bad!



In [70]:

    
from sklearn.feature_extraction.text import TfidfTransformer



In [71]:

    
from sklearn.pipeline import Pipeline



In [72]:

    
pipeline = Pipeline([
    ("cv", CountVectorizer()),
    ("Tfidf", TfidfTransformer()),
    ("mnb", MultinomialNB())
])



In [74]:

    
X = yelp_class["text"]
y = yelp_class["stars"]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3)



In [75]:

    
pipeline.fit(X_train, y_train)









    Out[75]:





Pipeline(steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_a...inear_tf=False, use_idf=True)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])



In [76]:

    
ppred = pipeline.predict(X_test)



In [77]:

    
print(confusion_matrix(ppred, y_test))
print("\n")
print(classification_report(ppred, y_test))









    



[[   1    0]
 [ 202 1023]]


             precision    recall  f1-score   support

          1       0.00      1.00      0.01         1
          5       1.00      0.84      0.91      1225

avg / total       1.00      0.84      0.91      1226

Slightly Worse!



In [ ]:

	business_id	date	review_id	stars	text	type	user_id	cool	useful
0	9yKzy9PApeiPPOUJEtnvkg	2011-01-26	fWKvX83p0-ka4JS3dc6E5A	5	My wife took me here on my birthday for breakf...	review	rLtl8ZkDX5vH5nAx9C3q5Q	2	5
1	ZRJwVLyzEJq1VAihDhYiow	2011-07-27	IjZ33sJrzXqU-0X6U8NwyA	5	I have no idea why some people give bad review...	review	0a2KyEL0d3Yb1V6aivbIuQ	0	0
2	6oRAC4uyJCsJl1X0WZpVSA	2012-06-14	IESLBzqUCLdSzSqm0eCSxQ	4	love the gyro plate. Rice is so good and I als...	review	0hT2KtfLiobPvh6cDC8JQg	0	1
3	_1QQZuf4zZOyFCvXc0o6Vg	2010-05-27	G-WvGaISbqqaMHlNnByodA	5	Rosie, Dakota, and I LOVE Chaparral Dog Park!!...	review	uZetl9T0NcROGOyFfughhg	1	2
4	6ozycU1RpktNG2-1BroVtw	2012-01-05	1uJFq2r5QfJG_6ExMRCaGw	5	General Manager Scott Petello is a good egg!!!...	review	vYmM4KTsC8ZfQBg-j5MWkw	0	0

	stars	cool	useful	funny
count	10000.000000	10000.000000	10000.000000	10000.000000
mean	3.777500	0.876800	1.409300	0.701300
std	1.214636	2.067861	2.336647	1.907942
min	1.000000	0.000000	0.000000	0.000000
25%	3.000000	0.000000	0.000000	0.000000
50%	4.000000	0.000000	1.000000	0.000000
75%	5.000000	1.000000	2.000000	1.000000
max	5.000000	77.000000	76.000000	57.000000

	cool	useful	funny	text length
stars
1	0.576769	1.604806	1.056075	826.515354
2	0.719525	1.563107	0.875944	842.256742
3	0.788501	1.306639	0.694730	758.498289
4	0.954623	1.395916	0.670448	712.923142
5	0.944261	1.381780	0.608631	624.999101

	cool	useful	funny	text length
cool	1.000000	-0.743329	-0.944939	-0.857664
useful	-0.743329	1.000000	0.894506	0.699881
funny	-0.944939	0.894506	1.000000	0.843461
text length	-0.857664	0.699881	0.843461	1.000000

	stars	cool	useful	funny	text length
count	4086.000000	4086.000000	4086.000000	4086.000000	4086.000000
mean	4.266765	0.876897	1.422663	0.690651	661.938815
std	1.547868	2.336611	2.598515	1.961751	601.621371
min	1.000000	0.000000	0.000000	0.000000	6.000000
25%	5.000000	0.000000	0.000000	0.000000	256.000000
50%	5.000000	0.000000	1.000000	0.000000	489.500000
75%	5.000000	1.000000	2.000000	1.000000	878.000000
max	5.000000	77.000000	76.000000	39.000000	4986.000000