Compute yelp review ratings and counts by business category by geo

Compute 3 files for use in later analysis:

stars_by_geocat_SCALE_THRESH.csv Average reviews by category by GISJOIN.

qstars_by_geocat_SCALE_THRESH.csv Average reviews by category by GISJOIN, weighted by useful score.

counts_by_geocat_SCALE_THRESH.csv Total review counts and review counts as fraction of total by category by GISJOIN.

where SCALE is the geographic scale (e.g. track, county), which is set as a variable when running the notebook. THRESH is the minimum number of non-missing values needed to keep a category.



In [82]:

    
%pylab inline

scale = 'tract'
thresh = 99 # Minimum number of non-missing per category

import pandas, json
reviews = pandas.DataFrame(json.loads(l) for l in open('yelp/review.json'))









    



Populating the interactive namespace from numpy and matplotlib



In [83]:

    
reviews['useful'] = reviews['votes'].apply(lambda v: v['useful'])



In [84]:

    
ytract = pandas.DataFrame(json.loads(l) for l in open('business_'+scale+'.json'))
business = pandas.DataFrame(json.loads(l) for l in open('yelp/business.json'))



In [85]:

    
cats = []
for d in (json.loads(l) for l in open('yelp/business.json')):
    for c in d['categories']:
        cats.append(dict(business_id = d['business_id'], category=c))
cats = pandas.DataFrame(cats)
cats = cats.merge(ytract)



In [86]:

    
review_cats = reviews[['business_id', 'stars', 'useful']].merge(cats)



In [87]:

    
review_cats['wstars'] = review_cats.stars * (review_cats.useful + 1)



In [88]:

    
stars_by_geocat = review_cats[['stars']].groupby([review_cats.category, review_cats.GISJOIN]).mean()
stars_by_geocat = stars_by_geocat.unstack(level=0).dropna(how='all').dropna(axis=1, thresh=thresh)
stars_by_geocat.columns = stars_by_geocat.columns.get_level_values(1)
stars_by_geocat.reset_index(inplace=True)
stars_by_geocat.to_csv('stars_by_geocat_%s_%s.csv' % (scale, thresh))

wstars_by_geocat = review_cats[['wstars']].groupby([review_cats.category, review_cats.GISJOIN]).mean()
wstars_by_geocat = wstars_by_geocat.unstack(level=0).dropna(how='all').dropna(axis=1, thresh=thresh)
wstars_by_geocat.columns = wstars_by_geocat.columns.get_level_values(1)
wstars_by_geocat.reset_index(inplace=True)
wstars_by_geocat.to_csv('wstars_by_geocat_%s_%s.csv' % (scale, thresh))



In [89]:

    
gisjoin_counts = reviews[['business_id']].merge(ytract)
gisjoin_counts = (gisjoin_counts[['business_id']]
                  .groupby(gisjoin_counts.GISJOIN)
                  .count()
                  .reset_index()
                  .rename(columns=dict(business_id = 'TotalReviews'))
                 )



In [90]:

    
counts_by_geocat = cats[['business_id']].groupby([cats.category, cats.GISJOIN]).count()
counts_by_geocat.columns = ['business_count']
counts_by_geocat = counts_by_geocat.unstack(level=0).dropna(how='all').dropna(axis=1, thresh=thresh)
counts_by_geocat.columns = counts_by_geocat.columns.get_level_values(1)
counts_by_geocat.reset_index(inplace=True)
counts_by_geocat = counts_by_geocat.merge(gisjoin_counts)
counts_by_geocat = pandas.concat(
    (counts_by_geocat[['GISJOIN', 'TotalReviews']],
     counts_by_geocat[counts_by_geocat.columns[1:-1]]
        .div(counts_by_geocat['TotalReviews'], axis=0)
    ), axis=1)
counts_by_geocat.to_csv('counts_by_geocat_%s_%s.csv' % (scale, thresh))
counts_by_geocat.describe()









    Out[90]:






  
    
      
      TotalReviews
      Accessories
      Active Life
      American (New)
      American (Traditional)
      Apartments
      Appliances
      Appliances & Repair
      Arts & Crafts
      Arts & Entertainment
      ...
      Urgent Care
      Used, Vintage & Consignment
      Vape Shops
      Vegetarian
      Venues & Event Spaces
      Veterinarians
      Vietnamese
      Wine Bars
      Women's Clothing
      Yoga
    
  
  
    
      count
      1942.000000
      172.000000
      956.000000
      613.000000
      838.000000
      326.000000
      119.000000
      100.000000
      264.000000
      526.000000
      ...
      127.000000
      133.000000
      110.000000
      150.000000
      225.000000
      335.000000
      127.000000
      131.000000
      268.000000
      153.000000
    
    
      mean
      767.570546
      0.001617
      0.014804
      0.005619
      0.007275
      0.008600
      0.003122
      0.006783
      0.003433
      0.007896
      ...
      0.004079
      0.002155
      0.002459
      0.001713
      0.009421
      0.006857
      0.002290
      0.001610
      0.002191
      0.002902
    
    
      std
      4490.666066
      0.002477
      0.035192
      0.012745
      0.019041
      0.028363
      0.004444
      0.018197
      0.008720
      0.022892
      ...
      0.009887
      0.003696
      0.008339
      0.002792
      0.037087
      0.020490
      0.002933
      0.002491
      0.003322
      0.006197
    
    
      min
      1.000000
      0.000078
      0.000141
      0.000153
      0.000153
      0.000011
      0.000137
      0.000078
      0.000030
      0.000140
      ...
      0.000031
      0.000028
      0.000031
      0.000011
      0.000057
      0.000062
      0.000011
      0.000028
      0.000099
      0.000006
    
    
      25%
      50.000000
      0.000500
      0.001771
      0.001163
      0.001459
      0.000697
      0.000561
      0.000645
      0.000514
      0.001066
      ...
      0.000587
      0.000463
      0.000351
      0.000374
      0.000645
      0.000868
      0.000515
      0.000401
      0.000629
      0.000425
    
    
      50%
      180.000000
      0.000957
      0.004057
      0.002384
      0.003040
      0.001458
      0.001271
      0.001405
      0.001074
      0.002427
      ...
      0.001166
      0.000906
      0.000791
      0.000723
      0.001383
      0.001953
      0.001088
      0.000880
      0.001176
      0.000843
    
    
      75%
      618.000000
      0.001842
      0.010989
      0.005181
      0.006612
      0.004204
      0.003891
      0.004211
      0.002499
      0.005823
      ...
      0.003320
      0.002022
      0.001598
      0.001701
      0.005181
      0.004728
      0.003301
      0.001511
      0.002410
      0.002358
    
    
      max
      181002.000000
      0.020408
      0.400000
      0.166667
      0.333333
      0.333333
      0.030303
      0.125000
      0.083333
      0.333333
      ...
      0.071429
      0.027027
      0.083333
      0.022222
      0.333333
      0.250000
      0.018519
      0.014925
      0.033333
      0.047619
    
  

8 rows × 182 columns



In [ ]:

	TotalReviews	Accessories	Active Life	American (New)	American (Traditional)	Apartments	Appliances	Appliances & Repair	Arts & Crafts	Arts & Entertainment	...	Urgent Care	Used, Vintage & Consignment	Vape Shops	Vegetarian	Venues & Event Spaces	Veterinarians	Vietnamese	Wine Bars	Women's Clothing	Yoga
count	1942.000000	172.000000	956.000000	613.000000	838.000000	326.000000	119.000000	100.000000	264.000000	526.000000	...	127.000000	133.000000	110.000000	150.000000	225.000000	335.000000	127.000000	131.000000	268.000000	153.000000
mean	767.570546	0.001617	0.014804	0.005619	0.007275	0.008600	0.003122	0.006783	0.003433	0.007896	...	0.004079	0.002155	0.002459	0.001713	0.009421	0.006857	0.002290	0.001610	0.002191	0.002902
std	4490.666066	0.002477	0.035192	0.012745	0.019041	0.028363	0.004444	0.018197	0.008720	0.022892	...	0.009887	0.003696	0.008339	0.002792	0.037087	0.020490	0.002933	0.002491	0.003322	0.006197
min	1.000000	0.000078	0.000141	0.000153	0.000153	0.000011	0.000137	0.000078	0.000030	0.000140	...	0.000031	0.000028	0.000031	0.000011	0.000057	0.000062	0.000011	0.000028	0.000099	0.000006
25%	50.000000	0.000500	0.001771	0.001163	0.001459	0.000697	0.000561	0.000645	0.000514	0.001066	...	0.000587	0.000463	0.000351	0.000374	0.000645	0.000868	0.000515	0.000401	0.000629	0.000425
50%	180.000000	0.000957	0.004057	0.002384	0.003040	0.001458	0.001271	0.001405	0.001074	0.002427	...	0.001166	0.000906	0.000791	0.000723	0.001383	0.001953	0.001088	0.000880	0.001176	0.000843
75%	618.000000	0.001842	0.010989	0.005181	0.006612	0.004204	0.003891	0.004211	0.002499	0.005823	...	0.003320	0.002022	0.001598	0.001701	0.005181	0.004728	0.003301	0.001511	0.002410	0.002358
max	181002.000000	0.020408	0.400000	0.166667	0.333333	0.333333	0.030303	0.125000	0.083333	0.333333	...	0.071429	0.027027	0.083333	0.022222	0.333333	0.250000	0.018519	0.014925	0.033333	0.047619