Natural Language Processing


In [1]:
import os
import requests
import pandas as pd 
import numpy as np
import csv
import sys

csv.field_size_limit(sys.maxsize)

reviews = "/Users/skhederian/restaurant-health/format_reviews.csv"
data = "/Users/skhederian/restaurant-health/the_final_countdown.csv"

dfr = pd.read_csv(reviews)
dfd = pd.read_csv(data)

In [2]:
# Drop rows with NaN violations
dfreview = dfr.dropna(how='any').copy()
dfreview.shape


Out[2]:
(20377, 5)

In [3]:
# Remove outliers with high violation counts (> 4 standard deviations from the mean)
o = dfreview.copy()
odf = o[((o.violations - o.violations.mean()) / o.violations.std()).abs() < 4].copy()
odf.shape


Out[3]:
(20248, 5)

In [4]:
# Bin the violation counts into labeled categories
bins = [-1, 5, 10, 15, 20, 25, 30, 35, 40]
group_names = ['Perfect', 'Excellent', 'Great', 'Good', 'Bad', 'Very Bad', 'Rats', 'Shutdown']

odf['violationoutcat'] = pd.cut(odf['violations'], bins, labels=group_names)
odf.head()
odf.shape


Out[4]:
(20248, 6)
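
A quick sanity check (not in the original run) is to look at how the restaurants fall across these bins; a minimal sketch using the categorical column created above:

# Sketch: how many restaurants land in each violation bin
odf['violationoutcat'].value_counts().sort_index()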

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts over the aggregated review text
cv = CountVectorizer()
counts = cv.fit_transform(odf['string_agg'].values)
counts


Out[5]:
<20248x114008 sparse matrix of type '<class 'numpy.int64'>'
	with 7645234 stored elements in Compressed Sparse Row format>
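
The 114,008 columns of this matrix correspond to the fitted vocabulary. A minimal sketch for inspecting that mapping; the specific token shown is only an assumption:

# Sketch: the vectorizer maps each token to a column of the counts matrix
len(cv.vocabulary_)           # number of vocabulary terms (columns)
cv.vocabulary_.get('food')    # column index for 'food', if present (assumed token)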

In [6]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

# Fit a multinomial Naive Bayes classifier on the term counts
targetout = odf.violationoutcat.values
classifier.fit(counts, targetout)


Out[6]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vectorizer',  cv),
    ('classifier',  classifier)])
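
Because the first step is the CountVectorizer, the pipeline accepts raw review text directly once it has been fit (as in the cross-validation loop below). A hedged usage sketch; the review string is made up:

# Sketch: classify a raw review string with the fitted pipeline
pipeline.predict(["The kitchen looked spotless and the staff was friendly."])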

In [8]:
from sklearn.cross_validation import KFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score


# 12-fold cross-validation over the aggregated review text
k_fold = KFold(n=len(odf), n_folds=12)
f1scores = []
ascores = []
rscores = []
pscores = []

for train_indices, test_indices in k_fold:
    train_text = odf.iloc[train_indices]['string_agg'].values
    train_y = odf.iloc[train_indices]['violationoutcat'].values

    test_text = odf.iloc[test_indices]['string_agg'].values
    test_y = odf.iloc[test_indices]['violationoutcat'].values

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    # weighted-average scores for this fold
    f1score = f1_score(test_y, predictions, average='weighted')
    f1scores.append(f1score)
    accuracy = accuracy_score(test_y, predictions)
    ascores.append(accuracy)
    recall = recall_score(test_y, predictions, average='weighted')
    rscores.append(recall)
    precision = precision_score(test_y, predictions, average='weighted')
    pscores.append(precision)


//anaconda/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
[the UndefinedMetricWarning pair above repeats for each of the 12 folds]
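
As the DeprecationWarning notes, sklearn.cross_validation was replaced by model_selection in 0.18 and removed in 0.20. A minimal sketch of the equivalent loop with the newer API (same fold count; not run here):

# Sketch: the same 12-fold split with the model_selection API
from sklearn.model_selection import KFold

k_fold = KFold(n_splits=12)
for train_indices, test_indices in k_fold.split(odf):
    ...  # identical body: slice with .iloc, fit the pipeline, score the predictions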

In [9]:
print('Total reviews classified:', len(odf))
print('F1 Score:', sum(f1scores)/len(f1scores))
print('Accuracy:', sum(ascores)/len(ascores))
print('Recall:', sum(rscores)/len(rscores))
print('Precision:', sum(pscores)/len(pscores))


Total reviews classified: 20248
F1 Score: 0.494298051694
Accuracy: 0.521337724781
Recall: 0.521337724781
Precision: 0.481683076583

In [10]:
# Pickle the fitted pipeline with joblib
from sklearn.externals import joblib

joblib.dump(pipeline, 'class.pkl', compress=9)


Out[10]:
['class.pkl']
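
To reuse the pickled pipeline later (for example, in a small app), a hedged sketch of loading and predicting; the review text is made up:

# Sketch: load the pickled pipeline and classify new text
from sklearn.externals import joblib

clf = joblib.load('class.pkl')
clf.predict(["Great food, but the dining room looked dirty."])  # hypothetical review text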

Text Features


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0.25, stop_words='english')
counts = cv.fit_transform(odf['string_agg'].values)
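
Note that classifier was last fitted inside the cross-validation loop on the full vocabulary, so its coef_ columns do not correspond to this new, much smaller n-gram vocabulary. To line the coefficients up with cv.get_feature_names(), the model would need to be refit on these counts; a minimal sketch (not part of the original run):

# Sketch: refit Naive Bayes on the reduced n-gram vocabulary so that
# classifier.coef_ columns line up with cv.get_feature_names()
classifier.fit(counts, targetout)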

In [12]:
def most_informative_feature_for_class(cv, classifier, classlabel, n=30):
    # Print the n features with the highest log-probability weights for the given class
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = cv.get_feature_names()
    topn = sorted(zip(classifier.coef_[labelid], feature_names))[-n:]

    for coef, feat in topn:
        print(classlabel, feat, coef)

In [13]:
most_informative_feature_for_class(cv, classifier, 'Perfect')


Perfect new -15.0071462471
Perfect perfect -15.0071462471
Perfect restaurant -15.0071462471
Perfect taste -15.0071462471
Perfect wait -15.0071462471
Perfect flavor -14.7840026958
Perfect people -14.7840026958
Perfect review -14.7840026958
Perfect large -14.601681139
Perfect time -14.601681139
Perfect want -14.3139990665
Perfect menu -14.0908555152
Perfect day -13.9955453354
Perfect came -13.620851886
Perfect minutes -13.3977083346
Perfect left -13.0261447782
Perfect ok -12.9594534037
Perfect especially -12.9277047054
Perfect just -12.8969330467
Perfect far -12.8670800836
Perfect quite -12.5222395973
Perfect place -12.4616149755
Perfect close -12.089375515
Perfect long -12.0496351863
Perfect thought -11.8936309379
Perfect actually -11.8501458259
Perfect decent -11.3371948029
Perfect think -9.31678679276
Perfect 10 -8.94103815698
Perfect things -7.34703192791

In [15]:
most_informative_feature_for_class(cv, classifier, 'Shutdown')


Shutdown table -11.680040317
Shutdown taste -11.680040317
Shutdown tasted -11.680040317
Shutdown tasty -11.680040317
Shutdown thing -11.680040317
Shutdown think -11.680040317
Shutdown thought -11.680040317
Shutdown times -11.680040317
Shutdown told -11.680040317
Shutdown took -11.680040317
Shutdown town -11.680040317
Shutdown tried -11.680040317
Shutdown try -11.680040317
Shutdown used -11.680040317
Shutdown usually -11.680040317
Shutdown ve -11.680040317
Shutdown visit -11.680040317
Shutdown wait -11.680040317
Shutdown want -11.680040317
Shutdown wanted -11.680040317
Shutdown wasn -11.680040317
Shutdown way -11.680040317
Shutdown went -11.680040317
Shutdown won -11.680040317
Shutdown work -11.680040317
Shutdown worth -11.680040317
Shutdown years -11.680040317
Shutdown 10 -10.9868931365
Shutdown time -10.9868931365
Shutdown things -9.88828084779

In [16]:
cv.get_feature_names()


Out[16]:
['10',
 'actually',
 'amazing',
 'area',
 'ask',
 'asked',
 'atmosphere',
 'away',
 'awesome',
 'bad',
 'bar',
 'best',
 'better',
 'big',
 'bit',
 'busy',
 'came',
 'check',
 'cheese',
 'chicken',
 'clean',
 'close',
 'come',
 'coming',
 'cooked',
 'couple',
 'customer',
 'day',
 'decent',
 'decided',
 'definitely',
 'delicious',
 'did',
 'didn',
 'different',
 'dinner',
 'disappointed',
 'don',
 'drink',
 'drinks',
 'eat',
 'eating',
 'especially',
 'excellent',
 'experience',
 'far',
 'fast',
 'favorite',
 'feel',
 'flavor',
 'food',
 'fresh',
 'fried',
 'friend',
 'friendly',
 'friends',
 'getting',
 'going',
 'good',
 'got',
 'great',
 'half',
 'happy',
 'hard',
 'home',
 'hot',
 'huge',
 'inside',
 'just',
 'kind',
 'know',
 'large',
 'left',
 'let',
 'like',
 'little',
 'll',
 'location',
 'long',
 'look',
 'looked',
 'looking',
 'lot',
 'love',
 'lunch',
 'make',
 'maybe',
 'meal',
 'meat',
 'menu',
 'minutes',
 'need',
 'new',
 'nice',
 'night',
 'ok',
 'open',
 'order',
 'ordered',
 'overall',
 'people',
 'perfect',
 'place',
 'places',
 'pretty',
 'pretty good',
 'price',
 'prices',
 'probably',
 'quality',
 'quick',
 'quite',
 'really',
 'really good',
 'recommend',
 'restaurant',
 'review',
 'rice',
 'right',
 'said',
 'salad',
 'sauce',
 'say',
 'served',
 'service',
 'small',
 'special',
 'spot',
 'staff',
 'star',
 'stars',
 'super',
 'sure',
 'sweet',
 'table',
 'taste',
 'tasted',
 'tasty',
 'thing',
 'things',
 'think',
 'thought',
 'time',
 'times',
 'told',
 'took',
 'town',
 'tried',
 'try',
 'used',
 'usually',
 've',
 'visit',
 'wait',
 'want',
 'wanted',
 'wasn',
 'way',
 'went',
 'won',
 'work',
 'worth',
 'years']

Visualization


In [17]:
import matplotlib.pyplot as plt

from wordcloud import WordCloud

# Build a word cloud from the fitted vocabulary terms
text = ' '.join(cv.get_feature_names())
wordcloud = WordCloud(font_path='/Library/Fonts/Verdana.ttf',
                      relative_scaling=1.0,
                      stopwords={'to', 'of'}
                      ).generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
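
Since the cloud above is built from the bare feature-name list, every term gets equal weight. A hedged alternative sketch that weights terms by how often they appear in the corpus, using generate_from_frequencies:

# Sketch: weight each term by its total count in the corpus
term_counts = np.asarray(counts.sum(axis=0)).ravel()
freqs = dict(zip(cv.get_feature_names(), term_counts))
# Note: older wordcloud releases expect a list of (word, count) pairs instead of a dict
wordcloud = WordCloud(relative_scaling=1.0).generate_from_frequencies(freqs)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()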