In [1]:
import os
import requests
import pandas as pd
import numpy as np
import csv
import sys
# Raise the csv module's per-field size cap to the largest value the platform allows.
# NOTE(review): presumably needed because some review text fields are very large — confirm.
csv.field_size_limit(sys.maxsize)

# Directory holding the input CSVs. Set RESTAURANT_HEALTH_DIR to run the
# notebook on another machine; the default is the original author's location.
DATA_DIR = os.environ.get("RESTAURANT_HEALTH_DIR", "/Users/skhederian/restaurant-health")
reviews = os.path.join(DATA_DIR, "format_reviews.csv")
data = os.path.join(DATA_DIR, "the_final_countdown.csv")

# Load both files into DataFrames.
dfr = pd.read_csv(reviews)
dfd = pd.read_csv(data)
In [2]:
# Keep only the rows of dfr that have a value in every column: a NaN in any
# column removes the whole row. The trailing copy() gives dfreview its own data.
dfreview = dfr.dropna(axis=0, how="any").copy()
dfreview.shape
Out[2]:
In [3]:
# Discard rows whose violation count lies 4 or more standard deviations from
# the mean (in either direction, since the absolute z-score is compared).
o = dfreview.copy()
violation_z = (o.violations - o.violations.mean()) / o.violations.std()
odf = o[violation_z.abs() < 4]
odf.shape
Out[3]:
In [4]:
# Bucket the violation counts into eight labelled ranges. pd.cut uses
# right-closed intervals by default, so the buckets are (-1, 5], (5, 10], ..., (35, 40].
# NOTE(review): a count above 40 matches no bucket and becomes NaN — confirm
# that none survive the outlier filter.
bins = [-1, 5, 10, 15, 20, 25, 30, 35, 40]
group_names = ['Perfect', 'Excellent', 'Great', 'Good', 'Bad', 'Very Bad', 'rats', 'Shutdown']

# odf was produced by boolean-filtering another frame, so writing a column
# into it in place raises SettingWithCopyWarning. assign() returns a new frame
# with the extra column instead, and re-running the cell gives the same result.
odf = odf.assign(violationoutcat=pd.cut(odf['violations'], bins, labels=group_names))
odf.shape
Out[4]:
In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words model with default settings: learn the vocabulary from the
# review strings and build the document-term count matrix in one call.
cv = CountVectorizer()
review_texts = odf['string_agg'].values
counts = cv.fit_transform(review_texts)
counts
Out[5]:
In [6]:
from sklearn.naive_bayes import MultinomialNB
# Training labels: the binned violation category of every row.
targetout = odf['violationoutcat'].values

# Multinomial naive Bayes over the word counts. The fit() call is the last
# expression of the cell, so its return value is what the cell displays.
classifier = MultinomialNB()
classifier.fit(counts, targetout)
Out[6]:
In [7]:
from sklearn.pipeline import Pipeline
# Chain the existing vectorizer and classifier objects so raw review text can
# be fitted and predicted in a single call.
pipeline = Pipeline(
    steps=[
        ('vectorizer', cv),
        ('classifier', classifier),
    ]
)
In [8]:
# sklearn.cross_validation was removed from scikit-learn; KFold now lives in
# sklearn.model_selection, takes n_splits, and yields the index arrays from
# .split() instead of being iterated directly.
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# 12-fold cross-validation of the text pipeline; one score per fold is
# collected for each metric.
k_fold = KFold(n_splits=12)
f1scores = []
ascores = []
rscores = []
pscores = []
for train_indices, test_indices in k_fold.split(odf):
    # Positional (iloc) selection of this fold's training and test rows.
    train_text = odf.iloc[train_indices]['string_agg'].values
    train_y = odf.iloc[train_indices]['violationoutcat'].values
    test_text = odf.iloc[test_indices]['string_agg'].values
    test_y = odf.iloc[test_indices]['violationoutcat'].values

    # Every fit replaces the state learned in the previous fold, so after the
    # loop the pipeline holds only the last fold's fit.
    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    #scores
    f1score = f1_score(test_y, predictions, average='weighted')
    f1scores.append(f1score)
    accuracy = accuracy_score(test_y, predictions)
    ascores.append(accuracy)
    recall = recall_score(test_y, predictions, average='weighted')
    rscores.append(recall)
    precision = precision_score(test_y, predictions, average='weighted')
    pscores.append(precision)
In [9]:
# Report the number of rows used, then the mean of each per-fold metric.
print('Total reviews classified:', len(odf))
for label, fold_scores in (
    ('F1 Score:', f1scores),
    ('Accuracy:', ascores),
    ('Recall:', rscores),
    ('Precision:', pscores),
):
    print(label, sum(fold_scores) / len(fold_scores))
In [10]:
#Create a Pickle
# sklearn.externals.joblib is the copy of joblib that scikit-learn used to
# bundle and has since removed. Prefer the standalone joblib package (the same
# library) and fall back to the bundled copy on older installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# The cross-validation loop leaves the pipeline fitted on the last fold's
# training split only. Refit on every row so the saved model uses all the data.
pipeline.fit(odf['string_agg'].values, odf['violationoutcat'].values)

# Persist the fitted pipeline with maximum compression.
joblib.dump(pipeline, 'class.pkl', compress=9)
Out[10]:
In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Second vectorizer for feature inspection: word 1- to 3-grams, English stop
# words removed, keeping only terms that occur in at least 25% of the documents.
cv = CountVectorizer(analyzer='word', ngram_range=(1,3), min_df = .25, stop_words = 'english')
counts = cv.fit_transform(odf['string_agg'].values)

# The classifier fitted earlier learned its per-feature weights against the
# vocabulary of the default CountVectorizer. Pairing those weights with this
# vectorizer's feature names would attach every weight to the wrong term.
# Fit a fresh model on the new counts so names and weights line up. Rebinding
# the name `classifier` leaves the estimator object held by `pipeline` untouched.
classifier = MultinomialNB().fit(counts, odf['violationoutcat'].values)
In [12]:
def most_informative_feature_for_class(cv, classifier, classlabel, n=30):
    """Print the n highest-weighted features of one class, weakest first.

    Each output line is ``classlabel feature weight``.

    Parameters
    ----------
    cv : fitted vectorizer
        Must expose ``get_feature_names_out()`` or ``get_feature_names()``.
    classifier : fitted classifier
        Must expose ``classes_`` and ``feature_log_prob_`` (one row of
        per-feature weights per class, in ``classes_`` order).
    classlabel : label
        The class to report; must be present in ``classifier.classes_``.
    n : int, default 30
        How many features to print. Values <= 0 print nothing.

    Raises
    ------
    ValueError
        If ``classlabel`` is not one of ``classifier.classes_``.
    """
    import warnings

    labelid = list(classifier.classes_).index(classlabel)

    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); accept whichever the vectorizer has.
    get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
    feature_names = get_names()

    # feature_log_prob_ always has one row per class. The deprecated coef_
    # alias dropped the first row for two-class models, so indexing it with a
    # classes_ position picked the wrong class or failed.
    weights = classifier.feature_log_prob_[labelid]

    # zip() would silently truncate and mis-pair names with weights when the
    # vectorizer and the classifier were not fitted on the same vocabulary.
    if len(feature_names) != len(weights):
        warnings.warn(
            "vectorizer has %d features but classifier has %d weights; "
            "the feature/weight pairing below is not meaningful"
            % (len(feature_names), len(weights))
        )

    ranked = sorted(zip(weights, feature_names))
    # A bare [-0:] slice would return the whole list, so guard n <= 0.
    topn = ranked[-n:] if n > 0 else []
    for coef, feat in topn:
        print (classlabel, feat, coef)
In [13]:
# Top-weighted features for the 'Perfect' category (default of 30 features).
most_informative_feature_for_class(cv=cv, classifier=classifier, classlabel='Perfect')
In [15]:
# Top-weighted features for the 'Shutdown' category (default of 30 features).
most_informative_feature_for_class(cv=cv, classifier=classifier, classlabel='Shutdown')
In [16]:
# List the vectorizer's vocabulary. get_feature_names() was removed from newer
# scikit-learn releases in favour of get_feature_names_out(); use whichever
# exists and convert to plain strings so the output is a list of str either way.
[str(name) for name in (getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names)()]
Out[16]:
In [17]:
from os import path
from scipy.misc import imread
import matplotlib.pyplot as plt
import random
from wordcloud import WordCloud
# Build the word-cloud input from the vectorizer's vocabulary. Joining with
# spaces gives clean text; str() of the list would also feed the brackets,
# quotes and commas of the list's repr into the tokenizer.
# get_feature_names() was removed from newer scikit-learn releases, so use
# get_feature_names_out() when it exists.
_get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
text = ' '.join(_get_names())

# stopwords must be a collection of words. A plain string such as 'to of' is
# iterated character by character, so neither word would be filtered.
# NOTE(review): font_path is a macOS system font location — confirm it exists
# on the machine running the notebook.
wordcloud = WordCloud(font_path='/Library/Fonts/Verdana.ttf',
                      relative_scaling = 1.0,
                      stopwords = {'to', 'of'}
                      ).generate(text)

# Render the cloud as an image with the axes hidden.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()