In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import os
import scipy as sp
from scipy import stats
from tools.plt import color2d #from the 'srcole/tools' repo
from matplotlib import cm
pd.options.display.max_rows = 1000
In [2]:
# Load cities info; the first CSV column becomes the row index
cities_csv = '/gh/data2/yelp/city_pop.csv'
df_cities = pd.read_csv(cities_csv, index_col=0)
df_cities.head()
Out[2]:
In [3]:
# Load the restaurants table; the first CSV column becomes the row index
restaurants_csv = '/gh/data2/yelp/food_by_city/df_restaurants.csv'
df_restaurants = pd.read_csv(restaurants_csv, index_col=0)
df_restaurants.head()
Out[3]:
In [4]:
# Load the categories-by-restaurant table, using the sparse pickle as a cache.
# The cache is rebuilt from the CSV only when the pickle is missing, so the cell
# no longer needs a hand-edited `if False:` switch and works on a fresh checkout.
categories_csv = '/gh/data2/yelp/food_by_city/df_categories.csv'
categories_sparse_pkl = '/gh/data2/yelp/food_by_city/df_categories_sparse.pkl'
if os.path.exists(categories_sparse_pkl):
    # NOTE(review): unpickling can execute arbitrary code; only load caches
    # that were written by this notebook.
    df_categories = pd.read_pickle(categories_sparse_pkl)
else:
    # NOTE(review): DataFrame.to_sparse was removed in pandas 1.0; rebuilding the
    # cache needs an older pandas (or a port to SparseDtype) - confirm before use.
    # The sparse frame is kept in memory so both branches yield the same type.
    df_categories = pd.read_csv(categories_csv, index_col=0).to_sparse(fill_value=0)
    df_categories.to_pickle(categories_sparse_pkl)
df_categories.head()
Out[4]:
In [5]:
# Category metadata; these are used for the 'category' input to the search function
categories_json = '/gh/data2/yelp/categories.json'
df_categories_info = pd.read_json(categories_json)
df_categories_info.head()
Out[5]:
In [6]:
from bokeh.io import output_notebook
from bokeh.layouts import row, widgetbox
from bokeh.models import CustomJS, Slider, Legend, HoverTool
from bokeh.plotting import figure, output_file, show, ColumnDataSource
output_notebook()
In [7]:
# Per-cuisine summary table: mean rating, mean cost and number of restaurants,
# one row for every column of df_categories.
all_cuisines = df_categories.keys()
cuisine_dict = {'cuisine': [],
                'avg_rating': [],
                'avg_cost': [],
                'N': []}
for cuisine_name in all_cuisines:
    # Restaurants whose indicator for this category equals 1
    is_member = df_categories[cuisine_name] == 1
    members = df_restaurants[is_member]
    summary = (('cuisine', cuisine_name),
               ('avg_rating', members['rating'].mean()),
               ('avg_cost', members['cost'].mean()),
               ('N', len(members)))
    for column, value in summary:
        cuisine_dict[column].append(value)
df_cuisine = pd.DataFrame.from_dict(cuisine_dict)
In [8]:
# Slider variables: lower bound of the slider and its initial value
min_N_franchises = 1000
min_N_franchises_limit = 2000

# Categories that are not cuisines of interest and are removed from the table
cuisines_rmv = ['bars', 'beer_and_wine', 'beerbar', 'breweries', 'butcher', 'cafes', 'catering',
                'chickenshop', 'cocktailbars', 'convenience', 'cosmetics', 'customcakes',
                'deptstores', 'divebars', 'drugstores', 'eventplanning', 'farmersmarket', 'fooddeliveryservices',
                'foodstands', 'gastropubs', 'gourmet', 'grocery', 'healthmarkets', 'importedfood', 'intlgrocery',
                'karaoke', 'lounges', 'markets', 'meats', 'musicvenues', 'personalchefs', 'pubs',
                'restaurants', 'salvadoran', 'seafoodmarkets', 'servicestations', 'sportsbars', 'streetvendors',
                'tapasmallplates', 'venues', 'wine_bars', 'wineries']

# Keep only cuisines with more than min_N_franchises restaurants. reset_index()
# (without drop) keeps the previous row labels in an 'index' column, as before.
df_cuisine = df_cuisine[df_cuisine['N'] > min_N_franchises].reset_index()

# Limit cuisines to only those of interest by removing some
cuisine_idx_drop = [label for label, name in zip(df_cuisine.index, df_cuisine['cuisine'])
                    if name in cuisines_rmv]
df_cuisine = df_cuisine.drop(cuisine_idx_drop).reset_index(drop=True)

# Initial contents of the plotted data source. It is derived from the cleaned
# table and uses '>=' so that it equals what the slider callback produces for
# the slider's starting value; previously it was taken before the unwanted
# categories were removed, so the first render showed rows the callback
# would never return.
df_cuisine_limit = df_cuisine[df_cuisine['N'] >= min_N_franchises_limit].reset_index(drop=True)
In [9]:
# Interactive Bokeh scatter of average rating vs. average cost per cuisine, with a
# slider for the minimum number of restaurants. The whole cell is switched off by
# `if False:`; every statement below depends on source1/source2, so all of it
# belongs to the guarded body.
if False:
    # Create data source for plotting and Slider callback:
    # source1 is what the glyphs draw from, source2 is the full table the
    # callback re-filters whenever the slider moves.
    source1 = ColumnDataSource(df_cuisine_limit, id='source1')
    source2 = ColumnDataSource(df_cuisine, id='source2')

    hover = HoverTool(tooltips=[
        ("Cuisine", "@cuisine"),
        ("Avg Stars", "@avg_rating"),
        ("Avg $", "@avg_cost"),
        ("# locations", "@N")])

    # Make initial figure of average rating vs. average cost
    plot = figure(plot_width=400, plot_height=400,
                  x_axis_label='Average cost ($)',
                  y_axis_label='Average rating (stars)',
                  tools=[hover],
                  y_range=(2.5, 5), x_range=(1, 3))
    plot.scatter('avg_cost', 'avg_rating', source=source1, line_width=3, line_alpha=0.6, line_color='black')

    # Declare how to update plot on slider change: rebuild source1's columns from
    # the rows of source2 whose N is at least the slider value.
    # NOTE(review): `.get("data")` was replaced by direct `.data` access to match
    # the signal-style `change.emit()` call already used here - confirm against
    # the installed Bokeh version.
    callback = CustomJS(args=dict(s1=source1, s2=source2), code="""
        var d1 = s1.data;
        var d2 = s2.data;
        var min_n = N.value;
        d1["cuisine"] = [];
        d1["avg_rating"] = [];
        d1["avg_cost"] = [];
        d1["N"] = [];
        // strict '<' bound: the old '<=' read one element past the end
        for (var i = 0; i < d2["N"].length; i++) {
            if (d2["N"][i] >= min_n) {
                d1["cuisine"].push(d2["cuisine"][i]);
                d1["avg_rating"].push(d2["avg_rating"][i]);
                d1["avg_cost"].push(d2["avg_cost"][i]);
                d1["N"].push(d2["N"][i]);
            }
        }
        s1.change.emit();
        """)

    N_slider = Slider(start=min_N_franchises, end=10000, value=min_N_franchises_limit, step=100,
                      title="minimum number of restaurants", callback=callback)
    # The slider is created after the callback, so it is handed to the JS code here
    callback.args["N"] = N_slider

    # Define layout of plot and sliders
    layout = row(plot, widgetbox(N_slider))

    # Output and show
    output_file("/gh/srcole.github.io/assets/misc/cuisine_bokeh.html", title="Cuisine WIP")
    show(layout)
In [10]:
# Determine cuisines to rank
cuisines_keep = df_cuisine['cuisine'].unique()

# Per restaurant, sum the indicators of the kept cuisines; restaurants whose
# sum is falsy (zero) are left out of the working tables.
kept_category_counts = df_categories[cuisines_keep].sum(axis=1)
restaurant_have_category = kept_category_counts.to_dict()
df_restaurants_keep_idx = [idx for idx, n_tags in restaurant_have_category.items() if n_tags]

# Give both tables the same fresh 0..n-1 index, then join them row by row so
# every kept restaurant carries its cuisine indicator columns.
kept_restaurants = df_restaurants.loc[df_restaurants_keep_idx].reset_index(drop=True)
df_categories_temp = df_categories[cuisines_keep].loc[df_restaurants_keep_idx].reset_index(drop=True)
df_restaurants_temp = pd.merge(kept_restaurants, df_categories_temp, left_index=True, right_index=True)
In [11]:
# Mean and standard error of rating, review count and cost for every kept cuisine.
# A groupby is not used because a restaurant may belong to several cuisines.
dict_cuisines = {'cuisine': [],
                 'rating_avg': [],
                 'rating_sem': [],
                 'Nreview_avg': [],
                 'Nreview_sem': [],
                 'cost_avg': [],
                 'cost_sem': [],
                 'N': []}
# (output column prefix, source column in df_restaurants_temp)
metric_columns = (('rating', 'rating'),
                  ('Nreview', 'review_count'),
                  ('cost', 'cost'))
for cuisine_name in cuisines_keep:
    # Positions of the rows with a truthy indicator; they are used as labels,
    # which relies on df_restaurants_temp having a 0..n-1 index.
    member_rows = np.where(df_restaurants_temp[cuisine_name])[0]
    cuisine_rows = df_restaurants_temp.loc[member_rows]
    dict_cuisines['cuisine'].append(cuisine_name)
    for prefix, column in metric_columns:
        values = cuisine_rows[column]
        dict_cuisines[prefix + '_avg'].append(np.mean(values))
        dict_cuisines[prefix + '_sem'].append(sp.stats.sem(values))
    dict_cuisines['N'].append(len(cuisine_rows))

# Rating dataframe, best-rated cuisine first
df_cuisines_rating = (pd.DataFrame.from_dict(dict_cuisines)
                      .sort_values('rating_avg', ascending=False)
                      .reset_index(drop=True))
In [12]:
# Bar chart of average rating for the first N rows of df_cuisines_rating
N = 20
bar_positions = np.arange(N)
shown_ratings = df_cuisines_rating['rating_avg'].values[:N]
shown_cuisines = df_cuisines_rating['cuisine'].values[:N]

plt.figure(figsize=(12, 5))
plt.bar(bar_positions, shown_ratings, color='k', ecolor='.5')
# One tick per bar, labelled with the cuisine name
plt.xticks(bar_positions, shown_cuisines, size=15, rotation='vertical')
plt.xlabel('Cuisine', size=20)
plt.ylabel('Average rating', size=20)
plt.yticks(size=15)
plt.ylim((3.5, 4.5))
plt.xlim((-1, N))
Out[12]:
In [13]:
# Wider bar chart of average rating for the first N rows of df_cuisines_rating
N = 60
bar_positions = np.arange(N)
shown_ratings = df_cuisines_rating['rating_avg'].values[:N]
shown_cuisines = df_cuisines_rating['cuisine'].values[:N]

plt.figure(figsize=(30, 5))
plt.bar(bar_positions, shown_ratings, color='k', ecolor='.5')
# One tick per bar, labelled with the cuisine name
plt.xticks(bar_positions, shown_cuisines, size=15, rotation='vertical')
plt.xlabel('Cuisine', size=20)
plt.ylabel('Average Yelp rating', size=20)
plt.yticks(size=15)
plt.ylim((3, 4.5))
plt.xlim((-1, N))
Out[13]:
In [14]:
# Re-rank the cuisines by average cost (most expensive first) and plot the first N
df_cuisines_rating = (df_cuisines_rating
                      .sort_values('cost_avg', ascending=False)
                      .reset_index(drop=True))
N = 60
bar_positions = np.arange(N)
shown_costs = df_cuisines_rating['cost_avg'].values[:N]
shown_cuisines = df_cuisines_rating['cuisine'].values[:N]

plt.figure(figsize=(30, 5))
plt.bar(bar_positions, shown_costs, color='k', ecolor='.5')
# One tick per bar, labelled with the cuisine name
plt.xticks(bar_positions, shown_cuisines, size=15, rotation='vertical')
plt.xlabel('Cuisine', size=20)
plt.ylabel('Average Yelp cost ($)', size=20)
plt.yticks(size=15)
plt.ylim((1, 2.5))
plt.xlim((-1, N))
Out[14]:
In [15]:
# Re-rank the cuisines by average review count (highest first) and plot the first N
df_cuisines_rating = (df_cuisines_rating
                      .sort_values('Nreview_avg', ascending=False)
                      .reset_index(drop=True))
N = 60
bar_positions = np.arange(N)
shown_review_counts = df_cuisines_rating['Nreview_avg'].values[:N]
shown_cuisines = df_cuisines_rating['cuisine'].values[:N]

plt.figure(figsize=(30, 5))
plt.bar(bar_positions, shown_review_counts, color='k', ecolor='.5')
# One tick per bar, labelled with the cuisine name
plt.xticks(bar_positions, shown_cuisines, size=15, rotation='vertical')
plt.xlabel('Cuisine', size=20)
plt.ylabel('Average # Yelp reviews', size=20)
plt.yticks(size=15)
plt.xlim((-1, N))
Out[15]:
In [16]:
# Re-rank the cuisines by number of restaurants (largest first) and plot the
# first N on a logarithmic y-axis.
df_cuisines_rating.sort_values('N', ascending=False, inplace=True)
df_cuisines_rating.reset_index(drop=True, inplace=True)
N = 60
plt.figure(figsize=(30, 5))
# log=True switches the y-axis to a log scale as the bars are drawn. This replaces
# the trailing set_yscale("log", nonposy='clip') call: the `nonposy` keyword is no
# longer accepted by newer Matplotlib, and changing the scale after the ticks were
# configured reset the tick locator chosen below.
plt.bar(np.arange(N), df_cuisines_rating['N'].values[:N], color='k', ecolor='.5', log=True)
plt.xticks(np.arange(N), df_cuisines_rating['cuisine'].values[:N])
plt.ylabel('Total # restaurants', size=20)
plt.xlabel('Cuisine', size=20)
plt.xticks(size=15, rotation='vertical')
# Explicit decade ticks and limits, applied once the axis is already logarithmic
plt.yticks([10**3, 10**4, 10**5], size=15)
plt.ylim((10**3, 10**5))
plt.xlim((-1, N))
# `ax` is still bound for any later cell that expects it
ax = plt.gca()