In [ ]:
    
import matplotlib
    
In [ ]:
    
%matplotlib inline
    
In [ ]:
    
#  import useful classes of pandas
import pandas as pd
from pandas import Series, DataFrame, Index
    
In [ ]:
    
import settings
import census
import requests
# Build the Census API client, authenticating with the key read from settings.CENSUS_KEY.
c = census.Census(key=settings.CENSUS_KEY)
    
In [ ]:
    
# Display the version string of the imported census package.
census.__version__
    
In [ ]:
    
# NOTE(review): there are no call parentheses here, so this cell displays the
# `fields` attribute itself; it is actually called (with year=2010) in the next cell.
c.sf1.fields
    
In [ ]:
    
# Fetch the SF1 field listing for 2010; later cells read it with .keys() and .get().
sf1_fields = c.sf1.fields(year=2010)
    
In [ ]:
    
# List the keys of the field listing in sorted order.
sorted(sf1_fields.keys())
    
Let's parse more of the pieces that are in the fields.
In [ ]:
    
# let's just parse sf1.xml ourselves to get the concepts
# http://lxml.de/parsing.html
from lxml import etree
from itertools import islice
import re
def parse_concept_name(concept_name):
    """Split an SF1 concept name into label, title and variable count.

    A concept name is expected to look like ``"P12. Sex By Age [49]"``:
    a label ending in a period, a title, and optionally the number of
    variables in square brackets.

    Parameters
    ----------
    concept_name : str
        The raw concept name.

    Returns
    -------
    dict or None
        A dict with the keys ``'label'`` (including the trailing period),
        ``'clean_name'`` and ``'num_vars'``. ``'num_vars'`` is the digit
        string found inside the brackets, or the integer ``0`` when the
        name carries no bracketed count. ``None`` is returned for
        ``'Geographic Characteristics'`` and for any name that has no
        ``"label. title"`` structure at all.
    """
    if concept_name == 'Geographic Characteristics':
        return None

    # Raw strings: "\s", "\d" and "\." are not valid escapes in a plain string.
    m = re.search(r"(.+\.)\s+(.*)\s+\[(\d+)\]", concept_name)
    if m:
        return {'label': m.group(1),
                'clean_name': m.group(2),
                'num_vars': m.group(3)
                }

    # No bracketed count. Anchor at the end of the string so the whole title
    # is captured; requiring trailing whitespace instead would cut off the
    # last word of names that do not end in a space.
    m1 = re.search(r"(.+\.)\s+(.*?)\s*$", concept_name)
    if m1 is None:
        # Nothing that looks like "label. title": report it as unparseable
        # rather than failing on a None match object.
        return None
    return {'label': m1.group(1),
            'clean_name': m1.group(2),
            'num_vars': 0
            }
    
def concepts_2010_sf1(sf1_xml_path=None):
    """Yield one parsed record per concept found in the 2010 SF1 XML file.

    Parameters
    ----------
    sf1_xml_path : str, optional
        Location of a local copy of sf1.xml
        (http://www.census.gov/developers/data/sf1.xml). When omitted, the
        notebook author's original absolute path is used.

    Yields
    ------
    dict
        The result of ``parse_concept_name`` for each ``<concept>`` element's
        ``name`` attribute. Names for which ``parse_concept_name`` returns
        ``None`` (such as 'Geographic Characteristics') are skipped.
    """
    if sf1_xml_path is None:
        # NOTE(review): machine-specific default kept for backward compatibility.
        sf1_xml_path = "/Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data-2014/data/sf1.xml"
    doc = etree.parse(sf1_xml_path)
    for concept in doc.findall("//concept"):
        # Delegate to the shared parser instead of repeating its regex logic here.
        parsed = parse_concept_name(concept.attrib['name'])
        if parsed is not None:
            yield parsed
            
# Materialize every parsed concept record into a list and display it.
k = list(concepts_2010_sf1())    
k
    
In [ ]:
    
# Tabulate the concept records with a fixed column order, then preview the first rows.
df = DataFrame(k, columns=('label','clean_name','num_vars'))
df.head()
    
In [ ]:
    
import re
def sort_label(label):
    """Return a zero-padded form of a concept label that sorts naturally.

    The label is split into a letter prefix, a number and an optional letter
    suffix followed by a period; the number is padded to three digits, so
    ``"P5."`` becomes ``"P 005"`` and ``"PCT12A."`` becomes ``"PCT 012A"``.

    Parameters
    ----------
    label : str
        A label such as ``"P12A."``.

    Returns
    -------
    str
        ``"<prefix> <number padded to 3 digits><suffix>"``.

    Raises
    ------
    ValueError
        If ``label`` does not contain the letters/digits/period pattern.
    """
    # Raw string for the regex escapes; the character classes are plain
    # letters only (the previous "[A-Z,a-z]" also admitted a literal comma).
    match = re.search(r"([A-Za-z]+)(\d+)([A-Za-z]*)\.", label)
    if match is None:
        raise ValueError("cannot derive a sort label from {0!r}".format(label))
    (l1, l2, l3) = match.groups()
    return l1 + " " + "{l2:03d}".format(l2=int(l2)) + l3
# Add a zero-padded sort key derived from each label.
df['sort_label'] = df.label.apply(sort_label)
    
In [ ]:
    
# Select the concepts whose label begins with "P5" (a prefix match, so a label such as "P50." would match too).
df[df.label.str.startswith("P5")]
    
In [ ]:
    
# let's go right for the variables and generate a dict, DF
from lxml import etree
from itertools import islice
from collections import OrderedDict
# NOTE(review): machine-specific absolute path to a local copy of sf1.xml.
SF1_XML_PATH = "/Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data-2014/data/sf1.xml"
doc = etree.parse(SF1_XML_PATH)
variables = doc.findall("//variable")

# Map each variable name to its concept and element text, in document order.
variables_dict = OrderedDict(
    (variable.attrib['name'],
     dict(concept=variable.attrib['concept'], text=variable.text))
    for variable in variables
)
    
In [ ]:
    
# Look up one variable by name to inspect its concept and text.
variables_dict['P0050001']
    
In [ ]:
    
def P005_range(n0,n1):
    """Return the P005 variable names numbered from n0 up to, but excluding, n1.

    Each name is 'P005' followed by the number zero-padded to four digits.
    The result is a tuple, in ascending order.
    """
    names = []
    for number in range(n0, n1):
        suffix = "{i:04d}".format(i=number)
        names.append('P005' + suffix)
    return tuple(names)
# P0050001 through P0050017 (the upper bound 18 is exclusive).
P005_vars = P005_range(1,18)
# The same names as a single comma-separated string.
P005_vars_str = ",".join(P005_vars)
# Pair each variable name with its text from the parsed XML.
[(v,variables_dict[v]['text']) for v in P005_vars]
    
In [ ]:
    
# Built from a dict of dicts, so variable names become the columns and
# 'concept' / 'text' become the rows; the later cell transposes it.
variables_df = DataFrame(variables_dict)
variables_df.head()
    
In [ ]:
    
# Transpose so each variable is a row, then parse every concept string into its components.
variables_df.T.concept.apply(parse_concept_name)
    
In [ ]:
    
# Spot-check the parser on the concept string of a single variable.
parse_concept_name(variables_dict['P0050001']['concept'])
    
In [ ]:
    
# http://www.census.gov/developers/
import requests
# Fetch the API's data.json listing and decode the response body as JSON.
url = "http://api.census.gov/data.json"
api_json = requests.get(url).json()
# NOTE(review): the bare expression below displays the entire decoded response.
api_json
    
In [ ]:
    
# Number of top-level entries in the decoded response.
len(api_json)
    
In [ ]:
    
# Top-level keys of the decoded response.
api_json.keys()
    
In [ ]:
    
# Number of entries stored under the 'dataset' key.
len(api_json.get('dataset'))
    
In [ ]:
    
# Tabulate the 'dataset' entries and list the resulting column names.
# NOTE(review): this rebinds `df`, replacing the concepts frame built in an earlier cell.
df = DataFrame(api_json.get('dataset'))
df.columns
    
In [ ]:
    
# don't know why there's no 2010 census
# Keep the datasets whose title contains "census" (case-insensitive) and show only the title column.
df[df.title.apply(lambda s:'census' in s.lower())][['title']]
    
In [ ]:
    
# Show the c_vintage and title columns side by side for every dataset.
df[['c_vintage', 'title']]
    
In [ ]:
    
# good way to see list of datasets
# (all dataset titles as a plain, sorted list)
sorted(list(df['title'],))
    
In [ ]:
    
import requests
# Fetch the variable definitions for the 2010 SF1 dataset and decode them as JSON.
url = "http://api.census.gov/data/2010/sf1/variables.json"
var_json = requests.get(url).json()
# List the variable names in sorted order.
sorted(var_json['variables'].keys())
    
In [ ]:
    
# Inspect the definition of a single variable.
var_json['variables']['P0050002']
    
In [ ]:
    
from pandas import DataFrame
# Tabulate the variable definitions, then transpose the resulting frame for display.
DataFrame(var_json['variables']).T
    
This example was written by AJ Renold (and rewritten by R. Yee to adapt to changes in the Census API).
In [ ]:
    
# Redisplay the SF1 field listing fetched in an earlier cell.
sf1_fields
    
In [ ]:
    
# Refresh the 2010 SF1 field listing and pull out table P12 (Sex By Age).
sf1_fields = c.sf1.fields(year=2010)
gender_population_fields = sf1_fields.get('P12. Sex By Age [49]')

# Split the P12 fields by sex, leaving out the entries whose text is exactly
# ' Male: ' or ' Female: '.
male_fields = dict(
    (code, text)
    for code, text in gender_population_fields.items()
    if text != ' Male: ' and 'Male' in text
)
female_fields = dict(
    (code, text)
    for code, text in gender_population_fields.items()
    if text != ' Female: ' and 'Female' in text
)
    
In [ ]:
    
# Query the census API with the gender_population_fields
# (NAME plus every P12 variable code joined into one comma-separated string, for all states)
query_results = c.sf1.get(('NAME', ','.join(gender_population_fields.keys())), geo={'for': 'state:*'})
# Create a DataFrame
gender_df = pd.DataFrame(query_results)
    
In [ ]:
    
# Set the Index to the NAME column
# (the column's values are passed rather than its name; the numeric-cast cell
# below still skips a column called "NAME")
gender_df = gender_df.set_index(gender_df['NAME'])
    
In [ ]:
    
# Recast all numeric columns to be type int
for col in gender_df.columns:
    # "state" and "NAME" are left as they are; every other column is converted.
    if col != "state" and col != "NAME":
        gender_df[col] = gender_df[col].astype(int)
    
In [ ]:
    
from numpy import arange
def showPopulationPyramidPlot(df, state, male_fields, female_fields):
    """Draw a back-to-back population age pyramid for one state.

    Parameters
    ----------
    df : DataFrame
        Frame whose index holds the state names and whose columns include
        the census variable codes used as keys of ``male_fields`` and
        ``female_fields``.
    state : str
        Index label of the row to plot.
    male_fields, female_fields : dict
        Map a variable code to its descriptive text. The codes select which
        values are plotted; the male texts also provide the y tick labels.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Notes
    -----
    The bar positions are computed from the male values only, so the two
    dicts are expected to select the same number of columns.
    """
    # Imported locally so the function does not depend on pyplot having been
    # imported elsewhere in the notebook.
    import matplotlib.pyplot as plt

    # Row for the requested state (.loc replaces the removed DataFrame.ix).
    s = df.loc[state]

    # [code, value] pairs sorted by variable code.
    male_list = sorted([key, s[key]] for key in s.keys() if key in male_fields)
    female_list = sorted([key, s[key]] for key in s.keys() if key in female_fields)

    # calculate the bar locations and the maximum value
    bar_ypos = arange(len(male_list)) + .5
    max_val = max(val for label, val in male_list + female_list)

    # Female panel on the left (ax2), male panel on the right (ax1).
    fig, (ax2, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(18,8))
    fig.suptitle('Population Age Pyramid for {state}'.format(state=state), fontsize=14)

    # plot the male populations
    ax1.barh(bar_ypos, [val for label, val in male_list], align='center')
    ax1.set_xlim((0, max_val))
    ax1.set_yticks(bar_ypos)
    # Tick label = field text starting 3 characters past the first '!!'
    # (presumably the separator plus one space -- confirm against the field text).
    ax1.set_yticklabels([male_fields[label][male_fields[label].find('!!')+3:]
                         for label, val in male_list])
    ax1.set_xlabel('People')
    ax1.set_title('Male Population by Age')
    ax1.grid(True)

    # plot the female populations
    ax2.barh(bar_ypos, [val for label, val in female_list], align='center', color='red')
    ax2.set_yticks([])
    ax2.set_xlim(ax1.get_xlim()[::-1]) # reverses the x axis direction
    ax2.set_xlabel('People')
    ax2.set_title('Female Population by Age')
    ax2.grid(True)

    plt.subplots_adjust(wspace=0.22, hspace=0.0)
    plt.show()
    
In [ ]:
    
# Draw the pyramid for one state; the name is looked up in the index of gender_df.
showPopulationPyramidPlot(gender_df, 'Illinois', male_fields, female_fields)
    
In [ ]: