Compute 3 files for use in later analysis:
stars_by_geocat_SCALE_THRESH.csv Average reviews by category by GISJOIN.
qstars_by_geocat_SCALE_THRESH.csv Average reviews by category by GISJOIN, weighted by useful score.
counts_by_geocat_SCALE_THRESH.csv Total review counts and review counts as fraction of total by category by GISJOIN.
where SCALE is the geographic scale (e.g. track, county), which is set as a variable when running the notebook. THRESH is the minimum number of non-missing values needed to keep a category.
In [82]:
%pylab inline
scale = 'tract'
thresh = 99 # Minimum number of non-missing per category
import pandas, json
reviews = pandas.DataFrame(json.loads(l) for l in open('yelp/review.json'))
In [83]:
reviews['useful'] = reviews['votes'].apply(lambda v: v['useful'])
In [84]:
ytract = pandas.DataFrame(json.loads(l) for l in open('business_'+scale+'.json'))
business = pandas.DataFrame(json.loads(l) for l in open('yelp/business.json'))
In [85]:
cats = []
for d in (json.loads(l) for l in open('yelp/business.json')):
for c in d['categories']:
cats.append(dict(business_id = d['business_id'], category=c))
cats = pandas.DataFrame(cats)
cats = cats.merge(ytract)
In [86]:
review_cats = reviews[['business_id', 'stars', 'useful']].merge(cats)
In [87]:
review_cats['wstars'] = review_cats.stars * (review_cats.useful + 1)
In [88]:
stars_by_geocat = review_cats[['stars']].groupby([review_cats.category, review_cats.GISJOIN]).mean()
stars_by_geocat = stars_by_geocat.unstack(level=0).dropna(how='all').dropna(axis=1, thresh=thresh)
stars_by_geocat.columns = stars_by_geocat.columns.get_level_values(1)
stars_by_geocat.reset_index(inplace=True)
stars_by_geocat.to_csv('stars_by_geocat_%s_%s.csv' % (scale, thresh))
wstars_by_geocat = review_cats[['wstars']].groupby([review_cats.category, review_cats.GISJOIN]).mean()
wstars_by_geocat = wstars_by_geocat.unstack(level=0).dropna(how='all').dropna(axis=1, thresh=thresh)
wstars_by_geocat.columns = wstars_by_geocat.columns.get_level_values(1)
wstars_by_geocat.reset_index(inplace=True)
wstars_by_geocat.to_csv('wstars_by_geocat_%s_%s.csv' % (scale, thresh))
In [89]:
gisjoin_counts = reviews[['business_id']].merge(ytract)
gisjoin_counts = (gisjoin_counts[['business_id']]
.groupby(gisjoin_counts.GISJOIN)
.count()
.reset_index()
.rename(columns=dict(business_id = 'TotalReviews'))
)
In [90]:
counts_by_geocat = cats[['business_id']].groupby([cats.category, cats.GISJOIN]).count()
counts_by_geocat.columns = ['business_count']
counts_by_geocat = counts_by_geocat.unstack(level=0).dropna(how='all').dropna(axis=1, thresh=thresh)
counts_by_geocat.columns = counts_by_geocat.columns.get_level_values(1)
counts_by_geocat.reset_index(inplace=True)
counts_by_geocat = counts_by_geocat.merge(gisjoin_counts)
counts_by_geocat = pandas.concat(
(counts_by_geocat[['GISJOIN', 'TotalReviews']],
counts_by_geocat[counts_by_geocat.columns[1:-1]]
.div(counts_by_geocat['TotalReviews'], axis=0)
), axis=1)
counts_by_geocat.to_csv('counts_by_geocat_%s_%s.csv' % (scale, thresh))
counts_by_geocat.describe()
Out[90]:
In [ ]: