In [1]:
%matplotlib inline
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import csv
import sys
# Raise the stdlib csv module's per-field size cap as high as the platform
# allows. sys.maxsize does not fit in a C long on every platform (notably
# Windows), where csv.field_size_limit raises OverflowError, so back off until
# a value is accepted instead of crashing the first cell.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 10

# Directory holding the two input CSVs. The default is the original author's
# checkout; set the RESTAURANT_HEALTH_DIR environment variable to run this
# notebook on another machine without editing the cell.
DATA_DIR = os.environ.get("RESTAURANT_HEALTH_DIR", "/Users/skhederian/restaurant-health")
reviews = os.path.join(DATA_DIR, "format_reviews.csv")
data = os.path.join(DATA_DIR, "the_final_countdown.csv")

# dfr: review table (its 'string_agg' and 'thru_date' columns are used below).
# dfd: table carrying 'inspection_date', merged with the reviews further down.
dfr = pd.read_csv(reviews)
dfd = pd.read_csv(data)
In [2]:
# Keep only the review rows that have a value in every column (a row with a
# missing value in *any* column is discarded, not just missing violations).
# The result is an independent copy, so later column assignments never touch dfr.
complete_rows = dfr.notnull().all(axis=1)
dfreview = dfr[complete_rows].copy()
dfreview.shape
Out[2]:
In [3]:
# Load the pickled classifier.
# scikit-learn removed its vendored copy of joblib (sklearn.externals.joblib)
# in release 0.23, so prefer the standalone package and fall back to the
# vendored module only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load 'class.pkl' if it comes from a trusted source.
# NOTE(review): a pickle written by a different scikit-learn version may fail
# to load or behave differently - confirm the version it was trained with.
filename = 'class.pkl'
clf = joblib.load(filename)
In [4]:
# Feed the raw values of the 'string_agg' column to the loaded classifier and
# show how many predictions came back.
# NOTE(review): 'string_agg' presumably holds the concatenated review text per
# row - confirm against the reviews export.
review_texts = dfreview['string_agg'].values
predict = clf.predict(review_texts)
predict.shape
Out[4]:
In [5]:
# Store the classifier output next to the rows it was computed from. `predict`
# is assigned positionally, so it must have one entry per row of dfreview.
dfreview['predictions'] = predict
In [6]:
# Show the first five rows to sanity-check the new 'predictions' column.
dfreview.head(n=5)
Out[6]:
In [7]:
# Build `df` from the reviews with every cell equal to the string '2999-12-31'
# (in any column) swapped for '2017-01-01'. dfreview itself is left untouched.
# NOTE(review): '2999-12-31' looks like an "open-ended" sentinel in thru_date -
# confirm, and confirm that 2017-01-01 is the intended stand-in.
df = dfreview.replace(to_replace='2999-12-31', value='2017-01-01')
In [8]:
# Rename the '--business_id' column to '_id'; a frame without that column is
# left unchanged.
# NOTE(review): the merge further down joins on 'business_id', not '_id' -
# confirm which key column the reviews export actually contains.
df = df.rename(columns={'--business_id': '_id'})
In [9]:
# Parse the date columns. errors='coerce' turns unparseable values into NaT
# instead of raising; it replaces the old `coerce=True` keyword, which current
# pandas no longer accepts (TypeError: unexpected keyword argument).
df['thru_date'] = pd.to_datetime(df['thru_date'], errors='coerce')
dfd['inspection_date'] = pd.to_datetime(dfd['inspection_date'], errors='coerce')

# Derive the reviews' 'inspection_date' as the day after 'thru_date'; it is one
# of the join keys used by the merge with dfd further down. NaT stays NaT.
df['inspection_date'] = pd.DatetimeIndex(df['thru_date']) + pd.DateOffset(days=1)
In [10]:
# Inner-join dfd with the reviews: a row survives only when both frames share
# the same ('business_id', 'inspection_date') pair.
join_keys = ['business_id', 'inspection_date']
finaldf = dfd.merge(df, how='inner', on=join_keys)
In [11]:
# (rows, columns) of the merged frame - a quick check of how many rows
# survived the inner join.
finaldf.shape
Out[11]:
In [12]:
# Numeric code for each predicted label. Any label that is not a key here
# becomes NaN after the mapping.
group_names = {
    'Perfect': 5,
    'Excellent': 10,
    'Great': 15,
    'Good': 20,
    'Bad': 25,
    'Very Bad': 30,
    'rats': 35,
    'Shutdown': 40,
}

# Overwrite the label column with its numeric code. Re-running this cell
# without re-creating finaldf would map the numbers again and yield all NaN,
# because the integers are not keys of group_names.
finaldf['predictions'] = finaldf['predictions'].map(group_names)

# List the distinct codes present as a quick sanity check.
finaldf['predictions'].unique()
Out[12]:
In [13]:
# Persist the merged, encoded frame to 'pickles.csv' in the current working
# directory. The frame's index is written as the first, unnamed column.
finaldf.to_csv(path_or_buf='pickles.csv')
Next, feed the exported `pickles.csv` into our regression models!