In [1]:
%pylab inline
import pandas, json
# Load the review dump: the file is read as one JSON object per line.
# The `with` block closes the handle as soon as every line has been parsed,
# instead of leaving it open until garbage collection.
with open('yelp/review.json') as review_file:
    reviews = pandas.DataFrame([json.loads(line) for line in review_file])
In [2]:
# Lift the 'useful' entry out of each review's `votes` value into its own column.
# NOTE(review): assumes every `votes` value is a mapping with a 'useful' key — confirm.
reviews['useful'] = reviews['votes'].map(lambda votes: votes['useful'])
In [3]:
# Load two more line-delimited JSON files (one object per line).
# NOTE(review): `ytract` is presumably the business -> census-tract lookup that
# supplies the GISJOIN column used by later cells — confirm.
# Each handle is closed by its `with` block once the file has been parsed.
with open('business_track.json') as tract_file:
    ytract = pandas.DataFrame([json.loads(line) for line in tract_file])
with open('yelp/business.json') as business_file:
    business = pandas.DataFrame([json.loads(line) for line in business_file])
In [ ]:
# Build a long table with one row per (business, category) pair by flattening
# each business record's `categories` list.
with open('yelp/business.json') as business_file:
    category_rows = [
        dict(business_id=record['business_id'], category=category)
        for record in map(json.loads, business_file)
        for category in record['categories']
    ]
# Explicit columns keep the frame well-formed (and mergeable) even when no
# business has any category.
cats = pandas.DataFrame(category_rows, columns=['business_id', 'category'])
In [ ]:
# Attach the tract lookup to every (business, category) row. With no `on=`
# argument, merge() performs an inner join on the columns both frames share.
# NOTE(review): later cells read cats.GISJOIN, so that column presumably
# comes from `ytract` — confirm.
# counts_by_geocat is not written from this cell: it does not exist yet at this
# point on a fresh kernel, and the cell that builds it saves the same CSV.
cats = cats.merge(ytract)
In [ ]:
# Join each review's rating and usefulness onto the (business, category, tract)
# rows; the join uses whatever columns the two frames have in common.
review_cats = pandas.merge(
    reviews.loc[:, ['business_id', 'stars', 'useful']],
    cats,
)
In [ ]:
# Usefulness-weighted rating: stars * (useful + 1), so a review with zero
# useful votes still contributes its plain star rating.
review_cats['wstars'] = review_cats['stars'].mul(review_cats['useful'].add(1))
In [ ]:
In [ ]:
def _pivot_by_geocat(frame, value_column, aggregate, min_tracts=99):
    """Aggregate ``value_column`` per (category, GISJOIN) and pivot categories into columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``category`` and ``GISJOIN`` columns plus ``value_column``.
    value_column : str
        Name of the column to aggregate.
    aggregate : str
        Name of the groupby aggregation to apply, e.g. ``'mean'`` or ``'count'``.
    min_tracts : int, optional
        A category column is kept only if it has at least this many
        non-missing rows. Defaults to 99.

    Returns
    -------
    pandas.DataFrame
        A ``GISJOIN`` column followed by one column per surviving category.
    """
    per_pair = frame[[value_column]].groupby([frame.category, frame.GISJOIN]).agg(aggregate)
    # Move the category level into the columns: rows are GISJOIN values.
    wide = per_pair.unstack(level=0)
    # Drop rows with no value in any category, then sparse category columns.
    wide = wide.dropna(how='all').dropna(axis=1, thresh=min_tracts)
    # Columns are (value_column, category) pairs; keep only the category label.
    wide.columns = wide.columns.get_level_values(1)
    return wide.reset_index()


# Mean star rating per category and GISJOIN.
stars_by_geocat = _pivot_by_geocat(review_cats, 'stars', 'mean')
stars_by_geocat.to_csv('stars_by_geocat.csv')

# Mean usefulness-weighted star rating per category and GISJOIN.
wstars_by_geocat = _pivot_by_geocat(review_cats, 'wstars', 'mean')
wstars_by_geocat.to_csv('wstars_by_geocat.csv')

# Number of business rows per category and GISJOIN.
counts_by_geocat = _pivot_by_geocat(cats, 'business_id', 'count')
counts_by_geocat.to_csv('counts_by_geocat.csv')
In [ ]:
# Read the tract-level table. skiprows=[1] drops the second line of the file
# (line index 1) and keeps the first line as the header.
# NOTE(review): that second line is presumably a descriptive header row — confirm.
data = pandas.read_csv(
    'zip/rus2/nhgis0002_ds201_20135_2013_tract.csv',
    skiprows=[1],
)
In [ ]:
# Give the coded source columns readable names (added to `data` in place).
# NOTE(review): the meaning of each U* code is taken from the names assigned
# here — verify against the data set's codebook.
data['pc_income'] = data.UJAE001
# Ratio of UJ8E002 to UJ8E001; a fraction, not scaled to 0-100.
data['employed_percent'] = data.UJ8E002.div(data.UJ8E001)
data['median_age'] = data.UEFE001
data['median_gross_rent'] = data.UL9E001
data['median_family_income'] = data.UINE001

# Slim frame used for the correlation cells: the GISJOIN key plus the five indicators.
econ = data.loc[:, [
    'GISJOIN',
    'pc_income',
    'employed_percent',
    'median_age',
    'median_gross_rent',
    'median_family_income',
]]
In [ ]:
# Pairwise correlations between the economic indicators and the per-category
# weighted star ratings, as a long table of (econ, cat, r) rows.
wcorrs = (
    econ.merge(wstars_by_geocat)
    # Keep numeric columns only: corr() in pandas >= 2.0 raises on a
    # non-numeric column such as a string join key, where older versions
    # silently skipped it.
    .select_dtypes(include=['number'])
    .corr()
    .unstack()
    .reset_index()
)
wcorrs.columns = ['econ', 'cat', 'r']
# The indicator names are all lowercase; any label that is not all lowercase
# is treated as a category. Keep only indicator x category pairs.
wcorrs = wcorrs[wcorrs.econ.str.islower() & ~wcorrs.cat.str.islower()]
# Display the pairs whose correlation magnitude exceeds 0.3.
wcorrs[wcorrs.r.abs() > .3]
In [ ]:
# Pairwise correlations between the economic indicators and the per-category
# mean star ratings, as a long table of (econ, cat, r) rows.
scorrs = (
    econ.merge(stars_by_geocat)
    # Keep numeric columns only: corr() in pandas >= 2.0 raises on a
    # non-numeric column such as a string join key, where older versions
    # silently skipped it.
    .select_dtypes(include=['number'])
    .corr()
    .unstack()
    .reset_index()
)
scorrs.columns = ['econ', 'cat', 'r']
# The indicator names are all lowercase; any label that is not all lowercase
# is treated as a category. Keep only indicator x category pairs.
scorrs = scorrs[scorrs.econ.str.islower() & ~scorrs.cat.str.islower()]
# Display the pairs whose correlation magnitude exceeds 0.3.
scorrs[scorrs.r.abs() > .3]
In [ ]:
# Pairwise correlations between the economic indicators and the per-category
# business counts, as a long table of (econ, cat, r) rows.
ccorrs = (
    econ.merge(counts_by_geocat)
    # Keep numeric columns only: corr() in pandas >= 2.0 raises on a
    # non-numeric column such as a string join key, where older versions
    # silently skipped it.
    .select_dtypes(include=['number'])
    .corr()
    .unstack()
    .reset_index()
)
ccorrs.columns = ['econ', 'cat', 'r']
# The indicator names are all lowercase; any label that is not all lowercase
# is treated as a category. Keep only indicator x category pairs.
ccorrs = ccorrs[ccorrs.econ.str.islower() & ~ccorrs.cat.str.islower()]
# Display the pairs whose correlation magnitude exceeds 0.3.
ccorrs[ccorrs.r.abs() > .3]
In [ ]: