In [66]:
import pandas as pd
import numpy as np
In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
%matplotlib inline
In [4]:
# Load "yelp.csv" (path is relative to the working directory) into the `yelp`
# DataFrame that the later cells read from.
yelp = pd.read_csv("yelp.csv")
In [5]:
# Preview the first rows to see which columns the file provides.
yelp.head()
Out[5]:
In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
yelp.describe()
Out[6]:
In [7]:
# Column dtypes and non-null counts: a quick check for missing values.
yelp.info()
In [8]:
# Derive a feature holding the number of characters in each review's text.
review_lengths = yelp["text"].apply(len)
yelp["text length"] = review_lengths
In [9]:
# Preview again to confirm the "text length" column added in the previous cell.
yelp.head()
Out[9]:
In [10]:
# One panel per star rating, each showing a histogram of review length.
g = sns.FacetGrid(data=yelp, col="stars")
g.map(plt.hist, "text length")
Out[10]:
In [13]:
# Distribution of review length for each star rating.
sns.boxplot(
    data=yelp,
    x="stars",
    y="text length",
)
Out[13]:
In [14]:
# Number of reviews per star rating.
# The column must be passed as `x=`: in seaborn >= 0.12 the only positional
# parameter of countplot is `data`, so a positional "stars" collides with the
# `data=yelp` keyword and raises a TypeError.
sns.countplot(x="stars", data=yelp)
Out[14]:
In [35]:
# Mean of every numeric column for each star rating.
# `numeric_only=True` is required because the frame also holds string columns
# (e.g. "text"); pandas >= 2.0 no longer drops them silently and raises a
# TypeError when asked to average them.
df = yelp.groupby("stars").mean(numeric_only=True)
df
Out[35]:
In [38]:
# Pairwise Pearson correlation between the per-rating column means.
stars_corr = df.corr(method="pearson")
stars_corr
Out[38]:
In [39]:
# Show the correlation matrix as a heatmap, with each coefficient written in its cell.
sns.heatmap(data=stars_corr, annot=True)
Out[39]:
In [47]:
# Keep only the 1-star and 5-star reviews, turning the task into a two-class problem.
is_extreme_rating = yelp["stars"].isin([1, 5])
yelp_class = yelp.loc[is_extreme_rating]
In [53]:
# Summary statistics for the subset restricted to 1-star and 5-star reviews.
yelp_class.describe()
Out[53]:
In [54]:
# Features are the review text; the target is the star rating (1 or 5 after filtering).
X, y = yelp_class["text"], yelp_class["stars"]
In [55]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer with default settings; turns each document into a vector of token counts.
cv = CountVectorizer()
In [56]:
# Build the document-term count matrix straight from the source column rather
# than reassigning X from itself, so re-running this cell gives the same result
# (`X = cv.fit_transform(X)` would be fed its own sparse output on a second run).
# NOTE(review): the vocabulary is fitted on all rows before the train/test
# split, so tokens that occur only in test reviews still shape the feature
# space. Fit on the training text only if a strictly clean evaluation is needed.
X = cv.fit_transform(yelp_class["text"])
In [57]:
from sklearn.model_selection import train_test_split
In [58]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split, and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
In [62]:
from sklearn.naive_bayes import MultinomialNB
In [63]:
# Multinomial naive Bayes classifier with default hyperparameters.
nb = MultinomialNB()
In [64]:
# Train on the training split only; the test split is kept for evaluation.
nb.fit(X_train, y_train)
Out[64]:
In [67]:
# Predict star ratings for the held-out test split.
pred = nb.predict(X_test)
In [68]:
from sklearn.metrics import confusion_matrix, classification_report
In [69]:
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix is transposed and the report's precision and
# recall are exchanged. In the matrix below, rows are true ratings and columns
# are predicted ratings.
print(confusion_matrix(y_test, pred))
print("\n")
print(classification_report(y_test, pred))
In [70]:
from sklearn.feature_extraction.text import TfidfTransformer
In [71]:
from sklearn.pipeline import Pipeline
In [72]:
# Text-classification pipeline: raw text -> token counts -> TF-IDF weights -> multinomial naive Bayes.
pipeline_steps = [
    ("cv", CountVectorizer()),
    ("Tfidf", TfidfTransformer()),
    ("mnb", MultinomialNB()),
]
pipeline = Pipeline(steps=pipeline_steps)
In [74]:
# Rebind X to the raw review text (an earlier cell replaced it with a count
# matrix); the pipeline does its own vectorizing.
X = yelp_class["text"]
y = yelp_class["stars"]
# Hold out 30% of the rows. A fixed random_state makes the split, and every
# metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
In [75]:
# Fit all pipeline stages (vectorizer, TF-IDF transform, classifier) on the raw training text.
pipeline.fit(X_train, y_train)
Out[75]:
In [76]:
# Predict on the raw test text; the pipeline applies its fitted transforms before the classifier.
ppred = pipeline.predict(X_test)
In [77]:
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix is transposed and the report's precision and
# recall are exchanged. In the matrix below, rows are true ratings and columns
# are predicted ratings.
print(confusion_matrix(y_test, ppred))
print("\n")
print(classification_report(y_test, ppred))
In [ ]: