Natural Language Processing


In [1]:
import os
import requests
import pandas as pd 
import numpy as np
import csv
import sys


csv.field_size_limit(sys.maxsize)

reviews = "/Users/skhederian/restaurant-health/format_reviews.csv"
data = "/Users/skhederian/restaurant-health/the_final_countdown.csv"

dfr = pd.read_csv(reviews)
dfd = pd.read_csv(data)

In [2]:
#Drop NaN Violations
dfreview = dfr.dropna(how = 'any').copy()
dfreview.shape


Out[2]:
(20377, 5)

In [3]:
#Create bins for violation counts
bins = [-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
group_names = ['Perfect', 'Excellent', 'Great', 'Good', 'OK', 'Nil', 'Bad', 'Very Bad', 'RATS', 'Shutdown']

dfreview['violationcat'] = pd.cut(dfreview['violations'], bins, labels=group_names)
dfreview.head()
dfreview.shape


Out[3]:
(20377, 6)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(analyzer='word', ngram_range=(1,3), min_df = .25, stop_words = 'english')

counts = cv.fit_transform(dfreview['string_agg'].values)
counts


Out[4]:
<20377x162 sparse matrix of type '<class 'numpy.int64'>'
	with 1212278 stored elements in Compressed Sparse Row format>

In [5]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()

In [6]:
targets = dfreview['violationcat'].values
classifier.fit(counts, targets)


Out[6]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vectorizer',  cv),
    ('classifier',  classifier) ])

In [11]:
from sklearn.cross_validation import KFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score


k_fold = KFold(n=len(dfreview), n_folds=12)
f1scores = []
ascores = []
rscores = []
pscores = []

for train_indices, test_indices in k_fold:
    train_text = dfreview.iloc[train_indices]['string_agg'].values
    train_y = dfreview.iloc[train_indices]['violationcat'].values

    test_text = dfreview.iloc[test_indices]['string_agg'].values
    test_y = dfreview.iloc[test_indices]['violationcat'].values

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    #scores
    f1score = f1_score(test_y, predictions, average='weighted')
    f1scores.append(f1score)
    accuracy = accuracy_score(test_y, predictions)
    ascores.append(accuracy)
    recall = recall_score(test_y, predictions, average='weighted')
    rscores.append(recall)
    precision = precision_score(test_y, predictions, average='weighted')
    pscores.append(precision)


//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
//anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [12]:
print('Total reviews classified:', len(dfreview))
print('F1 Score:', sum(f1scores)/len(f1scores))
print ('Accuracy:', sum(ascores)/len(ascores))
print ('Recall:', sum(rscores)/len(rscores))
print ('Precision:', sum(pscores)/len(pscores))


Total reviews classified: 20377
F1 Score: 0.757716108971
Accuracy: 0.738431900517
Recall: 0.738431900517
Precision: 0.778954234524