In [36]:
%pylab inline
import pandas as pd
import json
def summarize(fname, title):
    """Plot how many geographic units contain each count of Starbucks stores.

    Parameters
    ----------
    fname : str
        Path to a JSON-lines file: one JSON object per line, each with a
        ``GISJOIN`` field identifying the geographic unit a store falls in.
    title : str
        Title for the resulting horizontal bar chart.
    """
    # Read one JSON record per line; `with` closes the handle explicitly
    # (the original leaked the open file object).
    with open(fname) as fh:
        d = pd.DataFrame(json.loads(line) for line in fh)
    # First value_counts(): number of stores per geographic unit.
    # Second value_counts(): number of units having each store count.
    # (Equivalent to the original's wrap-in-DataFrame / rename / re-index
    # dance, without the intermediate frame.)
    freq = d.GISJOIN.value_counts().value_counts()
    ax = freq.plot(kind='barh')
    ax.set_title(title)
    # Log scale: unit counts span several orders of magnitude.
    ax.set_xscale('log')
    ax.set_xlabel('number of geographic units')
    ax.set_ylabel('Starbucks per unit')
# Block-level summary: most census blocks that have any Starbucks have one.
summarize('sb_block.json', 'Frequency of starbucks counts for census blocks')
In [37]:
# Tract-level summary.  NOTE: the data file is named "sb_track.json" (sic)
# and must stay as-is; the chart title now uses the correct census term
# "tracts" (the original title misspelled it as "tracks").
summarize('sb_track.json', 'Frequency of starbucks counts for census tracts')
At the county level, the distribution of Starbucks counts per unit is much wider: a single county aggregates many blocks and tracts, so high store counts become common.
In [38]:
# County-level summary.  Counties aggregate many stores, so the bar chart
# has many distinct count categories; enlarge the figure so the y-axis
# labels stay readable.
import pylab  # NOTE(review): imports belong in the notebook's top cell; hoisted at least to the top of this one

summarize('sb_county.json', 'Frequency of starbucks counts for census counties')
# Resize the current figure AFTER plotting: with the inline backend the
# figure is rendered when the cell finishes, so a post-hoc resize of
# gcf() still takes effect.
pylab.gcf().set_size_inches(6., 16.)