In [ ]:
%pylab --no-import-all inline

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, Series, Index
import pandas as pd

In [ ]:
# Check that CENSUS_KEY is defined before any cell tries to build API URLs.
# getattr() is used so that a settings module lacking the attribute altogether
# fails with the same explanatory message as one where it is set to None.

import settings
assert getattr(settings, "CENSUS_KEY", None) is not None, (
    "CENSUS_KEY is missing or None in the settings module; "
    "set it to your Census API key before running this notebook"
)

The Census API documentation lists example URLs, but each one needs your API key appended before it will work. In this notebook, we'll use the IPython notebook's HTML display mechanism to render those examples as working links.


In [ ]:
# http://api.census.gov/data/2010/sf1/geo.html

In [ ]:
from IPython.core.display import HTML

In [ ]:
# Embed the Census geography documentation page in the notebook output.
# An iframe is not a void element in HTML, so it needs an explicit closing
# tag; the self-closing "<iframe .../>" form leaves the element unclosed.
HTML("<iframe src='http://api.census.gov/data/2010/sf1/geo.html' width='800px'></iframe>")

In [ ]:
%%HTML
<b>hi there</b>

In [ ]:
import urlparse
import urllib
from IPython.core.display import HTML

def add_census_key(url, api_key=settings.CENSUS_KEY):
    """Return `url` with a `key` query parameter set to `api_key`.

    The query string is parsed into a dict, the `key` entry is set
    (replacing any `key` already present), and the URL is reassembled.
    All other URL components are passed through unchanged; the query
    values are re-encoded by `urllib.urlencode`.
    """
    parsed = urlparse.urlparse(url)

    # parse_qs maps each parameter name to a *list* of values
    query_params = urlparse.parse_qs(parsed.query)
    query_params["key"] = api_key

    # doseq=True expands those lists back into repeated name=value pairs
    # (see http://stackoverflow.com/a/10233141/7782)
    new_query = urllib.urlencode(query_params, doseq=True)

    # swap only the query component and stitch the URL back together
    return urlparse.urlunparse(parsed._replace(query=new_query))


def c_url(url, title=None, api_key=settings.CENSUS_KEY):
    """Build an IPython HTML link to a Census API URL with the key added.

    url     -- example Census API URL (without a key)
    title   -- link text; when None, the original `url` is shown
    api_key -- key handed to add_census_key() for the link target

    Returns an `HTML` display object wrapping a single <a> element whose
    href is the key-bearing URL.
    """
    link_text = url if title is None else title
    anchor = """<a href="{url}">{title}</a>""".format(
        url=add_census_key(url, api_key),
        title=link_text,
    )
    return HTML(anchor)

# To see just the key-bearing URL string instead of a link:
#add_census_key("http://api.census.gov/data/2010/sf1?get=P0010001&for=county:*")

# Show a clickable link for one example query; the visible text is the
# key-free URL, while the link target carries the API key.
c_url(url="http://api.census.gov/data/2010/sf1?get=NAME,P0010001&for=state:*")

Scraping the examples


In [ ]:
import requests
from lxml.html import parse, fromstring

url = "http://api.census.gov/data/2010/sf1/geo.html"

# Use a timeout so the cell cannot hang indefinitely, and fail loudly on an
# HTTP error status instead of parsing an error page as if it were the docs.
response = requests.get(url, timeout=30)
response.raise_for_status()
r = response.content
doc = fromstring(r)

rows = doc.xpath("//table/tr")

# first row is the header

headers = [col.text for col in rows[0].findall('th')]
headers

# next rows are the census URL examples

In [ ]:
# Inspect the first data row (rows[0] is the header row).
row = rows[1]
cols = row.findall('td')

# cols[0]: Summary Level
print(cols[0].text)

# cols[1]: Description
print(cols[1].text)

In [ ]:
from itertools import islice
from lxml.html import parse

# let's actually now decorate the urls

def decorated_parse_examples(examples, api_key=settings.CENSUS_KEY):
    """Yield copies of `examples` rows with the URL column turned into links.

    examples -- iterable of dicts keyed by the table headers; the entry
                under headers[2] is expected to be an iterable of URL strings
    api_key  -- Census API key added to each link target

    Each yielded row is a shallow copy of the input row in which the
    headers[2] entry is replaced by an HTML string: one <a> element per URL
    (href = URL with the key added, text = original URL), joined by <br/>.
    The input rows are not modified.

    NOTE: relies on the module-level `headers` list to name the URL column.
    """
    for row in examples:
        urls_key = headers[2]
        new_row = row.copy()

        # Pass api_key through explicitly; previously the parameter was
        # accepted but ignored, so the settings default was always used.
        new_row[urls_key] = "<br/>".join(
            """<a href="{url_with_key}">{url}</a>""".format(
                url=url,
                url_with_key=add_census_key(url, api_key),
            )
            for url in row[urls_key]
        )

        yield new_row
        
def parse_urls_col(col):
    """Return every text fragment inside element `col`, in document order.

    Uses the element's itertext(), so text that follows child tags (e.g.
    after a <br>) is included as a separate list item.
    See http://stackoverflow.com/a/15074386/7782
    """
    return list(col.itertext())

def parse_census_examples(url="http://api.census.gov/data/2010/sf1/geo.html"):
    """Yield one dict per example row of a Census API geography table.

    url -- page to parse; defaults to the 2010 SF1 geography documentation,
           so existing zero-argument calls behave as before

    The first <tr> matched by //table/tr supplies the headers (its <th>
    texts). Every later row yields a dict mapping:
      headers[0] -> text of the first <td>
      headers[1] -> text of the second <td>
      headers[2] -> list of text fragments in the third <td>
                    (via parse_urls_col)

    This is a generator: the page is fetched and parsed only when iteration
    starts, and the result can be consumed once.
    """
    doc = parse(url)

    rows = doc.xpath("//table/tr")

    # first row is the header
    headers = [col.text for col in rows[0].findall('th')]

    for row in rows[1:]:
        cols = row.findall('td')
        yield ({headers[0]:cols[0].text,
                headers[1]:cols[1].text,
                headers[2]:parse_urls_col(cols[2])})

# Materialize the rows once. parse_census_examples() is a generator, so
# keeping the bare generator would leave it exhausted after the first
# rendering, and re-running the display cell would show an empty table.
parsed_examples = list(parse_census_examples())

In [ ]:
# let's redisplay the table of examples, with each URL now linking to its API-key-bearing version

from IPython.display import HTML
from jinja2 import Template

# Jinja2 template: one header row, then one table row per example, with
# cells looked up by header name.
URLS_TEMPLATE = """
 <table>
   <tr>
   {% for header in headers %}
     <th>{{header}}</th>
   {% endfor %}
   </tr>
   {% for row in rows %}
   <tr>
    {% for header in headers %}
      <td>{{row[header]}}</td>
    {% endfor %}
   </tr>
   {% endfor %}
 </table>"""

template = Template(URLS_TEMPLATE)

# Rows are decorated lazily; the generator is consumed during render().
decorated_rows = decorated_parse_examples(parsed_examples)
HTML(template.render(headers=headers, rows=decorated_rows))