In [1]:

    
%matplotlib inline

import os
import requests
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import csv
import sys


csv.field_size_limit(sys.maxsize)

reviews = "/Users/skhederian/restaurant-health/format_reviews.csv"
data = "/Users/skhederian/restaurant-health/the_final_countdown.csv"

dfr = pd.read_csv(reviews)
dfd = pd.read_csv(data)

Predict Violation Bin from Review Text



In [2]:

    
#Drop NaN Violations
dfreview = dfr.dropna(how = 'any').copy()
dfreview.shape









    Out[2]:





(20377, 5)



In [3]:

    
#Load our Pickle Classifier
from sklearn.externals import joblib

filename = 'class.pkl'
clf = joblib.load(filename)



In [4]:

    
predict = clf.predict(dfreview['string_agg'].values)
predict.shape









    Out[4]:





(20377,)



In [5]:

    
dfreview['predictions'] = predict



In [6]:

    
dfreview.head()









    Out[6]:






  
    
      
      business_id
      from_date
      thru_date
      violations
      string_agg
      predictions
    
  
  
    
      0
      --pOlFxITWnhzc7SHSIP0A
      2013-04-04
      2013-07-29
      9.0
      My group of four visited tonight for the first...
      Great
    
    
      1
      --pOlFxITWnhzc7SHSIP0A
      2013-07-30
      2013-10-29
      12.0
      My husband and I  recently ate dinner here for...
      Great
    
    
      2
      --pOlFxITWnhzc7SHSIP0A
      2013-10-30
      2014-03-19
      9.0
      Our waiter Chris was excellent!!!!!!!!!! My bu...
      Perfect
    
    
      3
      --pOlFxITWnhzc7SHSIP0A
      2014-03-20
      2014-06-11
      6.0
      I would like to Thank the staff at Block and G...
      Great
    
    
      4
      --pOlFxITWnhzc7SHSIP0A
      2014-06-12
      2014-09-17
      3.0
      Pork Belly appetizer was to die for!  A little...
      Perfect

Merge Data and Review DFs



In [7]:

    
df = dfreview.replace('2999-12-31','2017-01-01')



In [8]:

    
df.rename(columns = {'--business_id':'_id'}, inplace = True)



In [9]:

    
df['thru_date'] = pd.to_datetime(df['thru_date'], coerce=True)
dfd['inspection_date'] = pd.to_datetime(dfd['inspection_date'], coerce=True)
df['inspection_date'] = pd.DatetimeIndex(df['thru_date']) + pd.DateOffset(1)









    



//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: the coerce=True keyword is deprecated, use errors='coerce' instead
  if __name__ == '__main__':
//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: the coerce=True keyword is deprecated, use errors='coerce' instead
  from ipykernel import kernelapp as app



In [10]:

    
finaldf = pd.merge(dfd, df, on=['business_id', 'inspection_date'], how='inner')



In [11]:

    
finaldf.shape









    Out[11]:





(18879, 73)

Encode Predicted Violations Column



In [12]:

    
group_names = {'Perfect':5, 'Excellent':10, 'Great':15, 'Good':20, 'Bad':25, 'Very Bad':30, 'rats':35, 'Shutdown':40}

finaldf.predictions = finaldf.predictions.map(group_names)
finaldf.predictions.unique()









    Out[12]:





array([25, 20, 10,  5, 30, 15, 35, 40])



In [13]:

    
finaldf.to_csv('pickles.csv')

Now feed this into our regression models!

	business_id	from_date	thru_date	violations	string_agg	predictions
0	--pOlFxITWnhzc7SHSIP0A	2013-04-04	2013-07-29	9.0	My group of four visited tonight for the first...	Great
1	--pOlFxITWnhzc7SHSIP0A	2013-07-30	2013-10-29	12.0	My husband and I recently ate dinner here for...	Great
2	--pOlFxITWnhzc7SHSIP0A	2013-10-30	2014-03-19	9.0	Our waiter Chris was excellent!!!!!!!!!! My bu...	Perfect
3	--pOlFxITWnhzc7SHSIP0A	2014-03-20	2014-06-11	6.0	I would like to Thank the staff at Block and G...	Great
4	--pOlFxITWnhzc7SHSIP0A	2014-06-12	2014-09-17	3.0	Pork Belly appetizer was to die for! A little...	Perfect