In [1]:
%matplotlib inline

import os
import requests
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import csv
import sys


csv.field_size_limit(sys.maxsize)

reviews = "/Users/skhederian/restaurant-health/format_reviews.csv"
data = "/Users/skhederian/restaurant-health/the_final_countdown.csv"

dfr = pd.read_csv(reviews)
dfd = pd.read_csv(data)

Predict Violation Bin from Review Text


In [2]:
#Drop NaN Violations
dfreview = dfr.dropna(how = 'any').copy()
dfreview.shape


Out[2]:
(20377, 5)

In [3]:
#Load our Pickle Classifier
from sklearn.externals import joblib

filename = 'class.pkl'
clf = joblib.load(filename)

In [4]:
predict = clf.predict(dfreview['string_agg'].values)
predict.shape


Out[4]:
(20377,)

In [5]:
dfreview['predictions'] = predict

In [6]:
dfreview.head()


Out[6]:
business_id from_date thru_date violations string_agg predictions
0 --pOlFxITWnhzc7SHSIP0A 2013-04-04 2013-07-29 9.0 My group of four visited tonight for the first... Great
1 --pOlFxITWnhzc7SHSIP0A 2013-07-30 2013-10-29 12.0 My husband and I recently ate dinner here for... Great
2 --pOlFxITWnhzc7SHSIP0A 2013-10-30 2014-03-19 9.0 Our waiter Chris was excellent!!!!!!!!!! My bu... Perfect
3 --pOlFxITWnhzc7SHSIP0A 2014-03-20 2014-06-11 6.0 I would like to Thank the staff at Block and G... Great
4 --pOlFxITWnhzc7SHSIP0A 2014-06-12 2014-09-17 3.0 Pork Belly appetizer was to die for! A little... Perfect

Merge Data and Review DFs


In [7]:
df = dfreview.replace('2999-12-31','2017-01-01')

In [8]:
df.rename(columns = {'--business_id':'_id'}, inplace = True)

In [9]:
df['thru_date'] = pd.to_datetime(df['thru_date'], coerce=True)
dfd['inspection_date'] = pd.to_datetime(dfd['inspection_date'], coerce=True)
df['inspection_date'] = pd.DatetimeIndex(df['thru_date']) + pd.DateOffset(1)


//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: the coerce=True keyword is deprecated, use errors='coerce' instead
  if __name__ == '__main__':
//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: the coerce=True keyword is deprecated, use errors='coerce' instead
  from ipykernel import kernelapp as app

In [10]:
finaldf = pd.merge(dfd, df, on=['business_id', 'inspection_date'], how='inner')

In [11]:
finaldf.shape


Out[11]:
(18879, 73)

Encode Predicted Violations Column


In [12]:
group_names = {'Perfect':5, 'Excellent':10, 'Great':15, 'Good':20, 'Bad':25, 'Very Bad':30, 'rats':35, 'Shutdown':40}

finaldf.predictions = finaldf.predictions.map(group_names)
finaldf.predictions.unique()


Out[12]:
array([25, 20, 10,  5, 30, 15, 35, 40])

In [13]:
finaldf.to_csv('pickles.csv')

Now feed this into our regression models!