In [ ]:
import matplotlib
In [ ]:
%matplotlib inline
In [ ]:
# import useful classes of pandas
import pandas as pd
from pandas import Series, DataFrame, Index
In [ ]:
import settings
import census
import requests
# Create the Census API client, using the key stored in the local settings module
c = census.Census(key=settings.CENSUS_KEY)
In [ ]:
# Show which version of the census package is installed
census.__version__
In [ ]:
# Inspect the fields attribute of the SF1 client (referenced here, not called)
c.sf1.fields
In [ ]:
# Fetch the SF1 field listing for 2010
sf1_fields = c.sf1.fields(year=2010)
In [ ]:
# List the keys of the field listing in sorted order
sorted(sf1_fields)
Let's parse more of the pieces that are in the fields.
In [ ]:
# let's just parse sf1.xml ourselves to get the concepts
# http://lxml.de/parsing.html
from lxml import etree
from itertools import islice
import re
def parse_concept_name(concept_name):
    """Split an SF1 concept name into label, clean name and variable count.

    A concept name is expected to look like ``"P12. Sex By Age [49]"``: a
    label ending in a period, a descriptive name and, optionally, the number
    of variables in square brackets.

    Args:
        concept_name: The raw concept string.

    Returns:
        ``None`` for the special ``'Geographic Characteristics'`` concept and
        for any string that has no ``"<label>. "`` prefix.  Otherwise a dict
        with the keys ``'label'`` (trailing period included),
        ``'clean_name'`` and ``'num_vars'``.  ``'num_vars'`` holds the
        bracketed digits as a string, or the integer ``0`` when the name
        carries no bracketed count.
    """
    if concept_name == 'Geographic Characteristics':
        return None

    # Preferred form: "<label>. <name> [<count>]"
    m = re.search(r"(.+\.)\s+(.*)\s+\[(\d+)\]", concept_name)
    if m:
        return {'label': m.group(1),
                'clean_name': m.group(2),
                'num_vars': m.group(3)}

    # Form without a count.  Trailing whitespace is optional here, so the
    # last word of the name is never cut off when the string ends right
    # after the name.
    m1 = re.search(r"(.+\.)\s+(.*?)\s*$", concept_name)
    if m1 is None:
        # Nothing that looks like "<label>. <name>": report "not parseable"
        # the same way as for 'Geographic Characteristics'.
        return None
    return {'label': m1.group(1),
            'clean_name': m1.group(2),
            'num_vars': 0}
def concepts_2010_sf1(path=None):
    """Yield one parsed record per concept found in a local copy of sf1.xml.

    The file is the variable listing published at
    http://www.census.gov/developers/data/sf1.xml.

    Args:
        path: Location of the XML file.  When omitted, the path that used to
            be hard-coded in this function is used.

    Yields:
        The dict returned by ``parse_concept_name`` for the ``name``
        attribute of each element matched by ``//concept``.  Concepts for
        which ``parse_concept_name`` returns ``None`` (for example
        'Geographic Characteristics') are skipped.
    """
    if path is None:
        path = "/Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data-2014/data/sf1.xml"
    doc = etree.parse(path)
    for concept in doc.findall("//concept"):
        # Reuse the shared parser instead of repeating its regexes here
        parsed = parse_concept_name(concept.attrib['name'])
        if parsed is not None:
            yield parsed
# Materialise every parsed concept into a list and display it
k = list(concepts_2010_sf1())
k
In [ ]:
# Tabulate the parsed concepts with a fixed column order, then preview the first rows
df = DataFrame(k, columns=('label','clean_name','num_vars'))
df.head()
In [ ]:
import re
def sort_label(label):
    """Return a zero-padded form of a table label that sorts numerically.

    ``"P5."`` becomes ``"P 005"`` and ``"PCT12A."`` becomes ``"PCT 012A"``,
    so that plain string sorting orders the tables by their number.

    Args:
        label: A label made of letters, digits, optional trailing letters
            and a period.

    Returns:
        The letters, a space, the number padded to three digits, and any
        trailing letters.

    Raises:
        ValueError: If ``label`` does not contain that pattern.
    """
    m = re.search(r"([A-Za-z]+)(\d+)([A-Za-z]*)\.", label)
    if m is None:
        raise ValueError("unrecognized table label: {!r}".format(label))
    prefix, number, suffix = m.groups()
    return "{} {:03d}{}".format(prefix, int(number), suffix)
# Add a column holding the sortable form of every label
df['sort_label'] = df['label'].apply(sort_label)
In [ ]:
# Show the rows whose label begins with "P5"
df[df.label.str.startswith("P5")]
In [ ]:
# let's go right for the variables and generate a dict, DF
from lxml import etree
from itertools import islice
from collections import OrderedDict
# Parse the local copy of sf1.xml and index every element matched by
# //variable by its name attribute, keeping document order
SF1_XML_PATH = "/Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data-2014/data/sf1.xml"
doc = etree.parse(SF1_XML_PATH)
variables = doc.findall("//variable")
variables_dict = OrderedDict(
    (var.attrib['name'], {'concept': var.attrib['concept'], 'text': var.text})
    for var in variables
)
In [ ]:
# Look up the concept and text recorded for one variable
variables_dict['P0050001']
In [ ]:
def P005_range(n0,n1):
    """Return the P005 variable names numbered from n0 up to, but excluding, n1.

    Each name is ``'P005'`` followed by the number zero-padded to four digits.
    """
    template = "P005{:04d}"
    return tuple(template.format(number) for number in range(n0, n1))
# Names P0050001 .. P0050017, also joined into one comma-separated string
P005_vars = P005_range(1, 18)
P005_vars_str = ",".join(P005_vars)
# Pair every name with its descriptive text
[(name, variables_dict[name]['text']) for name in P005_vars]
In [ ]:
# Build a frame from the variables mapping (one column per variable name) and preview it
variables_df = DataFrame(variables_dict)
variables_df.head()
In [ ]:
# Transpose so each variable is a row, then parse every variable's concept string
variables_df.T.concept.apply(parse_concept_name)
In [ ]:
# Parse the concept string of a single variable
parse_concept_name(variables_dict['P0050001']['concept'])
In [ ]:
# http://www.census.gov/developers/
import requests
# Fetch http://api.census.gov/data.json and decode it as JSON
url = "http://api.census.gov/data.json"
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a JSON decoding failure
response = requests.get(url, timeout=30)
response.raise_for_status()
api_json = response.json()
api_json
In [ ]:
# Number of top-level entries in the decoded response
len(api_json)
In [ ]:
# Top-level keys of the decoded response
api_json.keys()
In [ ]:
# Number of entries stored under the 'dataset' key
len(api_json.get('dataset'))
In [ ]:
# Build a table from the entries under 'dataset' and list its columns
# NOTE: this rebinds the name df, replacing the concept table built earlier
df = DataFrame(api_json.get('dataset'))
df.columns
In [ ]:
# Show the titles that contain "census" (compared in lower case).
# NOTE(review): the 2010 census does not show up in this list; the reason is unknown.
df[df.title.apply(lambda s:'census' in s.lower())][['title']]
In [ ]:
# Show the c_vintage and title columns side by side
df[['c_vintage', 'title']]
In [ ]:
# A convenient overview of the datasets: every title, in sorted order
sorted(df['title'])
In [ ]:
import requests
# Fetch the variable listing for the 2010 SF1 dataset and decode it as JSON
url = "http://api.census.gov/data/2010/sf1/variables.json"
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a JSON decoding failure
response = requests.get(url, timeout=30)
response.raise_for_status()
var_json = response.json()
# Sorted names of everything under the 'variables' key
sorted(var_json['variables'])
In [ ]:
# Look up the entry recorded for one variable
var_json['variables']['P0050002']
In [ ]:
from pandas import DataFrame
# Tabulate the variables mapping, transposed so that each variable becomes a row
DataFrame(var_json['variables']).T
This example was written by AJ Renold (and rewritten by R. Yee to adapt to changes in the Census API).
In [ ]:
# Display the field listing fetched earlier
sf1_fields
In [ ]:
sf1_fields = c.sf1.fields(year=2010)

# Keep only the fields of the P12 "Sex By Age" concept
gender_population_fields = sf1_fields.get('P12. Sex By Age [49]')

# Split them by sex, leaving out the entry whose label is exactly
# ' Male: ' (respectively ' Female: ')
male_fields = dict(
    (key, val) for key, val in gender_population_fields.items()
    if 'Male' in val and not val == ' Male: '
)
female_fields = dict(
    (key, val) for key, val in gender_population_fields.items()
    if 'Female' in val and not val == ' Female: '
)
In [ ]:
# Ask the API for NAME plus every P12 field, for all states
query_results = c.sf1.get(
    ('NAME', ','.join(gender_population_fields)),
    geo={'for': 'state:*'}
)

# Wrap the result in a DataFrame
gender_df = DataFrame(query_results)
In [ ]:
# Index the rows by the values of the NAME column; because the column is
# passed as a Series, NAME also stays in the frame as a regular column
gender_df = gender_df.set_index(gender_df['NAME'])
In [ ]:
# Convert every column other than "state" and "NAME" to int
for col in gender_df.columns:
    if col not in ("state", "NAME"):
        gender_df[col] = gender_df[col].astype(int)
In [ ]:
from numpy import arange
def showPopulationPyramidPlot(df, state, male_fields, female_fields):
    """Draw a two-panel population pyramid for one row of ``df``.

    The female bars are drawn in the left panel on a reversed x axis and the
    male bars in the right panel, both on the same scale.

    Args:
        df: DataFrame whose index contains ``state`` and whose columns
            include the keys of ``male_fields`` and ``female_fields``.
        state: Index label of the row to plot; also shown in the title.
        male_fields: Mapping of column name to field label for the male bars.
            The part of each label after ``'!!'`` is used as the tick label.
        female_fields: Mapping of column name to field label for the female
            bars.

    Raises:
        ValueError: If none of the row's columns appear in either mapping.
    """
    # Imported here because the file only imports the top-level matplotlib
    # package and never binds the name plt.
    import matplotlib.pyplot as plt

    # Label-based row lookup; DataFrame.ix is no longer available in pandas
    row = Series(df.loc[state])

    # [column, value] pairs for each sex, ordered by column name
    male_list = sorted([key, row[key]] for key in row.keys() if key in male_fields)
    female_list = sorted([key, row[key]] for key in row.keys() if key in female_fields)
    if not male_list and not female_list:
        raise ValueError(
            "no columns of the row for {!r} match male_fields or female_fields".format(state))

    # Bar positions and the shared upper limit of both x axes
    bar_ypos = arange(len(male_list)) + .5
    max_val = max(val for _, val in male_list + female_list)

    # ax2 is the left-hand panel, ax1 the right-hand one
    fig, (ax2, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(18, 8))
    fig.suptitle('Population Age Pyramid for {state}'.format(state=state), fontsize=14)

    # Male populations
    ax1.barh(bar_ypos, [val for _, val in male_list], align='center')
    ax1.set_xlim((0, max_val))
    ax1.set_yticks(bar_ypos)
    ax1.set_yticklabels([_pyramid_tick_label(male_fields[key]) for key, _ in male_list])
    ax1.set_xlabel('People')
    ax1.set_title('Male Population by Age')
    ax1.grid(True)

    # Female populations
    ax2.barh(bar_ypos, [val for _, val in female_list], align='center', color='red')
    ax2.set_yticks([])
    ax2.set_xlim(ax1.get_xlim()[::-1])  # reverses the x axis direction
    ax2.set_xlabel('People')
    ax2.set_title('Female Population by Age')
    ax2.grid(True)

    plt.subplots_adjust(wspace=0.22, hspace=0.0)
    plt.show()


def _pyramid_tick_label(field_text):
    """Return the text after the first '!!' (and the one character following it).

    A label without '!!' is returned unchanged instead of being truncated.
    """
    _, separator, tail = field_text.partition('!!')
    return tail[1:] if separator else field_text
In [ ]:
# Draw the pyramid for the row labelled 'Illinois'
showPopulationPyramidPlot(gender_df, 'Illinois', male_fields, female_fields)
In [ ]: