Natural Language Processing

In [1]:
import os
import requests
import pandas as pd 
import numpy as np
import csv
import sys


reviews = "/Users/skhederian/restaurant-health/format_reviews.csv"
data = "/Users/skhederian/restaurant-health/the_final_countdown.csv"

dfr = pd.read_csv(reviews)
dfd = pd.read_csv(data)

In [2]:
#Drop NaN Violations
dfreview = dfr.dropna(how = 'any').copy()

(20377, 5)

In [3]:
#Remove outliers that have high violation (> 4 standard deviations)
o = dfreview.copy()
odf = o[((o.violations - o.violations.mean()) / o.violations.std()).abs() < 4]

(20248, 5)

In [4]:
#Create bins for violation counts
bins = [-1, 5, 10, 15, 20, 25, 30, 35, 40]
group_names = ['Perfect', 'Excellent', 'Great', 'Good', 'Bad', 'Very Bad', 'rats', 'Shutdown']

odf['violationoutcat'] = pd.cut(odf['violations'], bins, labels=group_names)

//anaconda/lib/python3.5/site-packages/ipykernel/ SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:
(20248, 6)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
counts = cv.fit_transform(odf['string_agg'].values)

<20248x114008 sparse matrix of type '<class 'numpy.int64'>'
	with 7645234 stored elements in Compressed Sparse Row format>

In [6]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

targetout = odf.violationoutcat.values, targetout)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vectorizer',  cv),
    ('classifier',  classifier)])

In [8]:
from sklearn.cross_validation import KFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

k_fold = KFold(n=len(odf), n_folds=12)
f1scores = []
ascores = []
rscores = []
pscores = []

for train_indices, test_indices in k_fold:
    train_text = odf.iloc[train_indices]['string_agg'].values
    train_y = odf.iloc[train_indices]['violationoutcat'].values

    test_text = odf.iloc[test_indices]['string_agg'].values
    test_y = odf.iloc[test_indices]['violationoutcat'].values, train_y)
    predictions = pipeline.predict(test_text)

    f1score = f1_score(test_y, predictions, average='weighted')
    accuracy = accuracy_score(test_y, predictions)
    recall = recall_score(test_y, predictions, average='weighted')
    precision = precision_score(test_y, predictions, average='weighted')

In [9]:
print('Total reviews classified:', len(odf))
print('F1 Score:', sum(f1scores)/len(f1scores))
print ('Accuracy:', sum(ascores)/len(ascores))
print ('Recall:', sum(rscores)/len(rscores))
print ('Precision:', sum(pscores)/len(pscores))

Total reviews classified: 20248
F1 Score: 0.494298051694
Accuracy: 0.521337724781
Recall: 0.521337724781
Precision: 0.481683076583

In [10]:
#Create a Pickle
from sklearn.externals import joblib

joblib.dump(pipeline, 'class.pkl', compress=9)


Text Features

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(analyzer='word', ngram_range=(1,3), min_df = .25, stop_words = 'english')
counts = cv.fit_transform(odf['string_agg'].values)

In [12]:
def most_informative_feature_for_class(cv, classifier, classlabel, n=30):
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = cv.get_feature_names()
    topn = sorted(zip(classifier.coef_[labelid], feature_names)) [-n:]
    for coef, feat in topn:
        print (classlabel, feat, coef)

In [13]:
most_informative_feature_for_class(cv, classifier, 'Perfect')

Perfect new -15.0071462471
Perfect perfect -15.0071462471
Perfect restaurant -15.0071462471
Perfect taste -15.0071462471
Perfect wait -15.0071462471
Perfect flavor -14.7840026958
Perfect people -14.7840026958
Perfect review -14.7840026958
Perfect large -14.601681139
Perfect time -14.601681139
Perfect want -14.3139990665
Perfect menu -14.0908555152
Perfect day -13.9955453354
Perfect came -13.620851886
Perfect minutes -13.3977083346
Perfect left -13.0261447782
Perfect ok -12.9594534037
Perfect especially -12.9277047054
Perfect just -12.8969330467
Perfect far -12.8670800836
Perfect quite -12.5222395973
Perfect place -12.4616149755
Perfect close -12.089375515
Perfect long -12.0496351863
Perfect thought -11.8936309379
Perfect actually -11.8501458259
Perfect decent -11.3371948029
Perfect think -9.31678679276
Perfect 10 -8.94103815698
Perfect things -7.34703192791

In [15]:
most_informative_feature_for_class(cv, classifier, 'Shutdown')

Shutdown table -11.680040317
Shutdown taste -11.680040317
Shutdown tasted -11.680040317
Shutdown tasty -11.680040317
Shutdown thing -11.680040317
Shutdown think -11.680040317
Shutdown thought -11.680040317
Shutdown times -11.680040317
Shutdown told -11.680040317
Shutdown took -11.680040317
Shutdown town -11.680040317
Shutdown tried -11.680040317
Shutdown try -11.680040317
Shutdown used -11.680040317
Shutdown usually -11.680040317
Shutdown ve -11.680040317
Shutdown visit -11.680040317
Shutdown wait -11.680040317
Shutdown want -11.680040317
Shutdown wanted -11.680040317
Shutdown wasn -11.680040317
Shutdown way -11.680040317
Shutdown went -11.680040317
Shutdown won -11.680040317
Shutdown work -11.680040317
Shutdown worth -11.680040317
Shutdown years -11.680040317
Shutdown 10 -10.9868931365
Shutdown time -10.9868931365
Shutdown things -9.88828084779

In [17]:
from os import path
from scipy.misc import imread
import matplotlib.pyplot as plt
import random

from wordcloud import WordCloud

text = str(cv.get_feature_names())
wordcloud = WordCloud(font_path='/Library/Fonts/Verdana.ttf',
                      relative_scaling = 1.0,
                      stopwords = 'to of'