In [22]:
import csv
import urllib
import urllib2
import os
import pandas as pd
import zipfile
import fnmatch
import shutil
import glob
In [23]:
# Fetch the Census state lookup table. The file is pipe-delimited, so parse
# it in a single pass with csv.reader(delimiter='|') instead of reading it
# with the default comma delimiter and then re-splitting each row by hand.
states_url = 'http://www2.census.gov/geo/docs/reference/state.txt'
response = urllib2.urlopen(states_url)
cr = csv.reader(response, delimiter='|')
states_list_2 = [row for row in cr]
states_list_2.pop(0)  # drop the header row (STATE|STUSAB|STATE_NAME|STATENS)
states_list_2[:3]
Out[23]:
In [24]:
# Assemble the state lookup frame; column names are passed to the
# constructor directly, and abbreviations are lowercased so they can be
# spliced into the census download urls later.
df_states = pd.DataFrame(states_list_2, columns=['id', 'abr', 'name', 'num'])
df_states['abr'] = df_states['abr'].str.lower()
df_states.head(3)
Out[24]:
In [25]:
# Create the working directories (no-op when they already exist).
data_dir = 'data'
pums_dir = 'data/pums-h'
for directory in (data_dir, pums_dir):
    if not os.path.exists(directory):
        os.makedirs(directory)
In [26]:
# Build each state's download url from the pieces, e.g.:
# http://www2.census.gov/programs-surveys/acs/data/pums/2014/1-Year/csv_hak.zip
base_url = 'http://www2.census.gov/programs-surveys/acs/data/pums/'
year = '2014'
middle_url = '/1-Year/csv_h'
end_url = '.zip'
df_states['url'] = df_states['abr'].map(
    lambda abr: base_url + year + middle_url + abr + end_url)
df_states.head(3)
Out[26]:
In [27]:
# Build the local zip path for each state under data/pums-h/.
df_states['path'] = df_states['abr'].map(lambda abr: 'data/pums-h/' + abr + '.zip')
df_states.head(3)
Out[27]:
In [28]:
# Download every state's zipped pums file to its local path.
for url, path in zip(df_states['url'], df_states['path']):
    urllib.urlretrieve(url, path)
In [29]:
# Drop the territory files not needed for the analysis. The original
# %%bash `rm` calls abort the cell if any file is missing and are not
# portable; removing defensively from python makes the cell re-runnable.
for territory in ('as', 'mp', 'um', 'gu', 'vi'):
    territory_zip = os.path.join('data', 'pums-h', territory + '.zip')
    if os.path.exists(territory_zip):
        os.remove(territory_zip)
In [30]:
# Unzip every state archive into a subdirectory named after the archive.
# Use the relative path (not an absolute /Users/... path) so the notebook
# is portable, and close each archive via the context manager.
rootPath = 'data/pums-h'
pattern = '*.zip'
for root, dirs, files in os.walk(rootPath):
    for filename in fnmatch.filter(files, pattern):
        archive_path = os.path.join(root, filename)
        extract_dir = os.path.join(root, os.path.splitext(filename)[0])
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(extract_dir)
In [31]:
# Collect the paths of all unzipped state csvs.
csv_pattern = 'data/pums-h/*/*.csv'
csv_list = glob.glob(csv_pattern)
csv_list[:5]
Out[31]:
In [32]:
# Read every state csv and stack them into one dataframe.
# NOTE: the loop variable must not be named `csv` -- that would shadow the
# csv module imported at the top of the notebook. The empty-DataFrame
# initializer was also dropped; df is assigned once by pd.concat.
frame_list = []
for csv_path in csv_list:
    frame_list.append(pd.read_csv(csv_path, index_col=None, header=0))
df = pd.concat(frame_list)
In [33]:
# peek at the combined household frame
df.head()
Out[33]:
In [34]:
# Remove the raw zip/csv tree now that everything lives in `df`.
# Guarded so the cell is idempotent: shutil.rmtree raises OSError if the
# directory is already gone (e.g. on a partial re-run).
if os.path.exists('data/pums-h'):
    shutil.rmtree('data/pums-h')
In [35]:
# Recreate an empty pums directory to hold derived outputs.
pums_dir = 'data/pums-h'
directory_missing = not os.path.exists(pums_dir)
if directory_missing:
    os.makedirs(pums_dir)
In [53]:
# Household-file variables to keep, per the 2014 PUMS data dictionary:
# http://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMSDataDict14.pdf
cols_list = [
    'RT',        # record type, H indicates house
    'SERIALNO',  # housing unit / GQ person serial number...unique identifier
    'DIVISION',  # division code
    'PUMA',      # public use microdata area code, based on 2010 census defn, 100k people per puma
    'REGION',    # region code
    'ST',        # state code
    'WGTP',      # housing weight
    'ADJINC',    # adj for income & earnings to 2014 values, divide by 1 million to do conversion
    'ADJHSG',    # adj for housing dollar amounts, divide by 1 million to do conversion
    'HINCP',     # household income past 12 months
    'FINCP',     # family income past 12 months
    'NOC',       # number of own children in household (unweighted)
    'NRC',       # number of related children in household (unweighted)
    'WORKSTAT',  # work status
    'WATP',      # yearly water cost
    'CONP',      # condo fee / month
    'ELEP',      # electricity fee / month
    'FULP',      # fuel cost / year
    'GASP',      # gas cost / month
    'GRNTP',     # gross rent / month, use grntp to adjust to constant dollars
    'GRPIP',     # gross income as % of hh income past 12 months
    'INSP',      # fire/hazard/flood insurance amount / year
    'MHP',       # mobile home costs / year
    'RNTP',      # rent cost / month
    'SMP',       # total payment on all second and junior mortgages and home equity loans / month
    'SMOCP',     # selected monthly owner costs, use adjhsg to adjust to constant dollars
    'TAXP',      # property taxes per year
    'MRGP',      # first mortgage payment / month
    'FS',        # yearly food stamp / snap recipiency
    'TEN',       # tenure: b n/a, 1 owned, 2 owned free & clear, 3 rented, 4 occupied w/out payment of rent
    'TYPE',      # type of unit: 1 housing unit, 2 institutional group quarters, 3 non-institutional group quarters
    'BLD',       # units in structure
    'FPARC',     # family presence and age of related children
    'FES',       # family type and employment status
    'KIT',       # complete kitchen facilities
    'PLM',       # complete plumbing facilities: b n/a, 1 yes, 2 no, 9 case from pr so plm n/a
    'MV',        # when moved into this house or apartment
    'VEH',       # number of vehicles available
]
In [54]:
# Keep only the household variables of interest.
df1 = df.loc[:, cols_list]
df1.head(3)
Out[54]:
In [55]:
# Persist the trimmed frame for downstream notebooks.
pickle_path = 'data/pums-h/metadata.pickle'
df1.to_pickle(pickle_path)