Main File

This notebook downloads and processes U.S. Census data at the block-group level: household-income attributes from the Census API and the matching TIGER block-group shapes.


In [40]:
import os.path
from census import Census
from us import states
import requests
import geopandas as gpd
import pandas as pd
import zipfile

# Pick the study area: exactly one of the assignments below should be active.
# Each option is (output file prefix, state FIPS code, county codes as integers).
loc_name, state_code, county_codes = "balt_city", states.MD.fips, [510] # Baltimore
# loc_name, state_code, county_codes = "greater_balt", states.MD.fips, [510, 5] # Baltimore City and County
# loc_name, state_code, county_codes = "new_orleans", states.LA.fips, [71] # New Orleans
# loc_name, state_code, county_codes = "greater_new_york", states.NY.fips, [5, 47, 61, 81] # Bronx, Kings, NY, Queens
# loc_name, state_code, county_codes = "new_york", states.NY.fips, [61] # NY county only

# Zero-padded, three-character string form of every selected county ID
county_list = list(map("{:03d}".format, county_codes))

# CENSUS API Stuff
# The key is read from the environment so it never has to be typed into (or
# committed with) the notebook. `os` is already bound by `import os.path`.
CENSUS_API = os.environ.get("CENSUS_API_KEY")
if not CENSUS_API:
    raise RuntimeError(
        "No Census API key found. Set the CENSUS_API_KEY environment variable "
        "before running this notebook."
    )
c = Census(CENSUS_API) # Initialize census class with API key

# Generate codes for census variables of interest
# Produces B19001_002E through B19001_017E (range(2, 18) excludes 18).
var_ids = ["B19001_0{:02d}E".format(x) for x in range(2, 18)] # Household income over 12 months

# TIGER Stuff: remote location of the 2013 shapefile archives
TIGER_BASE_URL = 'http://www2.census.gov/geo/tiger/TIGER2013/'
TIGER_TRACT_DIR, TIGER_BLOCKGROUP_DIR, TIGER_WATER_DIR = 'TRACT/', 'BG/', 'AREAWATER/'

# The block-group archive and the shapefile share one stem
tiger_bg_stem = 'tl_2013_{0}_bg'.format(state_code)
tiger_zip_file = tiger_bg_stem + '.zip'
tiger_shape_file = tiger_bg_stem + '.shp'

FULL_TIGER_URL = ''.join([TIGER_BASE_URL, TIGER_BLOCKGROUP_DIR, tiger_zip_file])

# Local Storage Parameters (directory fragments keep their trailing slash
# because paths are built by plain string concatenation)
LOCAL_DATA_DIR = './data/'
GEO_SUB_DIR = 'geo/'

# Output file names are "<loc_name><suffix>" inside LOCAL_DATA_DIR
ATTR_FILE_END = '_census_data.csv'
GEO_FILE_END = '_geo_data.json'
attr_outfile = LOCAL_DATA_DIR + loc_name + ATTR_FILE_END
geo_outfile = LOCAL_DATA_DIR + loc_name + GEO_FILE_END

Get census (attribute) data


In [35]:
def build_bg_fips(record):
    """Return the block-group identifier for one census record.

    The identifier is the record's 'state', 'county', 'tract' and
    'block group' values added together in that order, converted to ``str``.
    """
    components = ('state', 'county', 'tract', 'block group')
    combined = record[components[0]]
    for component in components[1:]:
        combined = combined + record[component]
    return str(combined)

def census_to_dataframe(var_list, state_code, county_codes):
    """Fetch block-group records for each county and combine them into one DataFrame.

    Uses the module-level census client ``c`` and the ``build_bg_fips`` helper.

    Parameters
    ----------
    var_list : list of str
        Census variable codes to request.
    state_code : str
        State code placed in the query's ``in`` clause.
    county_codes : iterable of int or str
        County codes. Each one is zero-padded to three digits before it is
        sent, so ``5`` is queried as ``005``.

    Returns
    -------
    pandas.DataFrame
        One row per returned record, indexed by the combined "fips" code; the
        separate 'state', 'county', 'tract' and 'block group' fields are removed.
    """
    geo_keys = ('state', 'county', 'tract', 'block group')
    all_records = []

    for county in county_codes:
        # A bare integer such as 5 would otherwise be sent as "county:5"
        # instead of the three-character form "county:005".
        county_fips = "{:03d}".format(int(county))
        census_data = c.acs.get(
            var_list,
            {'for': 'block group:*',
             'in': 'state:{0} county:{1}'.format(state_code, county_fips)})

        for record in census_data:
            # Build the combined code first: it needs the component fields
            record["fips"] = build_bg_fips(record)

            # Eliminate original code components
            for key in geo_keys:
                record.pop(key, None)

        all_records.extend(census_data)

    census_df = pd.DataFrame(all_records)
    census_df = census_df.set_index("fips")

    return census_df

# Pull the household income estimates for every block group in the
# state/counties chosen in the configuration cell.
census_df = census_to_dataframe(
    var_list=var_ids,
    state_code=state_code,
    county_codes=county_codes,
)

In [36]:
# Make sure the output folder exists so a fresh checkout can run this cell,
# then write the attribute table (indexed by fips) as CSV.
os.makedirs(LOCAL_DATA_DIR, exist_ok=True)
census_df.to_csv(attr_outfile)

Get TIGER (shape) data


In [37]:
# Check if file is in directory, else download it
tiger_zip_path = LOCAL_DATA_DIR + GEO_SUB_DIR + tiger_zip_file

if os.path.isfile(tiger_zip_path):
    print("Already had the file.  Great.")
else:
    r = requests.get(FULL_TIGER_URL)

    if r.status_code == requests.codes.ok:
        print("Got the file! Copying to disk.")
        # The geo folder may not exist yet on a fresh checkout
        os.makedirs(LOCAL_DATA_DIR + GEO_SUB_DIR, exist_ok=True)
        with open(tiger_zip_path, "wb") as f:
            f.write(r.content)
    else:
        # The placeholder is required, otherwise the status code is silently dropped
        print("Something went wrong. Status code: {0}".format(r.status_code))


Already had the file.  Great.

Trim shape data to match attributes


In [44]:
# Unzip file, extract contents
# The context manager closes the archive handle once extraction is finished.
with zipfile.ZipFile(LOCAL_DATA_DIR + GEO_SUB_DIR + tiger_zip_file) as zfile:
    zfile.extractall(LOCAL_DATA_DIR + GEO_SUB_DIR)

# Load to GeoDataFrame
shapes = gpd.GeoDataFrame.from_file(LOCAL_DATA_DIR + GEO_SUB_DIR + tiger_shape_file)

# Only keep counties that we are interested in
# (county_list holds the zero-padded string form that COUNTYFP is compared against)
shapes = shapes[shapes["COUNTYFP"].isin(county_list)]

Get water data


In [41]:
# Check if file is in directory, else download it
water_dir = LOCAL_DATA_DIR + GEO_SUB_DIR
os.makedirs(water_dir, exist_ok=True)  # the geo folder may not exist on a fresh checkout

for county in county_list:
    # One area-water archive per county, named by state code + county code
    tiger_water_stem = "tl_2013_{0}{1}_areawater".format(state_code, county)
    tiger_water_zip_file = tiger_water_stem + ".zip"

    if os.path.isfile(water_dir + tiger_water_zip_file):
        print("Already had the file.  Great.")
    else:
        r = requests.get(TIGER_BASE_URL + TIGER_WATER_DIR + tiger_water_zip_file)

        if r.status_code == requests.codes.ok:
            print("Got the file! Copying to disk.")
            with open(water_dir + tiger_water_zip_file, "wb") as f:
                f.write(r.content)
        else:
            # The placeholder is required, otherwise the status code is silently dropped
            print("Something went wrong. Status code: {0}".format(r.status_code))

    # Unzip file, extract contents (the context manager closes the archive handle)
    with zipfile.ZipFile(water_dir + tiger_water_zip_file) as zfile:
        zfile.extractall(water_dir)

    # NOTE(review): the original statement here was cut off as `water_shape = g`,
    # which raises NameError. It is completed the same way the block-group
    # shapefile is loaded. This assumes the archive contains a .shp file with
    # the same stem as the zip -- TODO confirm. With several counties,
    # `water_shape` ends up holding only the last county's data.
    water_shape = gpd.GeoDataFrame.from_file(water_dir + tiger_water_stem + ".shp")


Got the file! Copying to disk.

Eliminate unneeded attributes, export shapes to geojson


In [39]:
# Simplify geometry to reduce file size
simplified_geometry = shapes["geometry"].simplify(tolerance=0.0001)

# Keep just two things per block group: its outline and its GEOID (as "fips")
small_shapes = gpd.GeoDataFrame()
small_shapes["geometry"] = simplified_geometry
small_shapes["fips"] = shapes["GEOID"]
small_shapes = small_shapes.set_index("fips")

# Serialise, then write to file
small_json = small_shapes.to_json()
with open(geo_outfile, 'w') as f:
    f.write(small_json)