In [22]:
import csv
import urllib
import urllib2
import os
import pandas as pd
import zipfile
import fnmatch
import shutil
import glob

In [23]:
# Download the census state reference file and parse it into rows of
# [fips id, abbreviation, name, gnis code].
# The file is pipe-delimited, so give csv.reader the correct delimiter
# instead of reading with the default comma delimiter and then manually
# re-splitting every field on '|' (the original two-pass approach).
states_url = 'http://www2.census.gov/geo/docs/reference/state.txt'
response = urllib2.urlopen(states_url)

states_list_2 = list(csv.reader(response, delimiter='|'))
states_list_2.pop(0)  # drop the header row (STATE|STUSAB|STATE_NAME|STATENS)

states_list_2[:3]


Out[23]:
[['01', 'AL', 'Alabama', '01779775'],
 ['02', 'AK', 'Alaska', '01785533'],
 ['04', 'AZ', 'Arizona', '01779777']]

In [24]:
# Tabulate the parsed state rows; the download URLs use lower-case
# state abbreviations, so normalize 'abr' up front.
df_states = pd.DataFrame(states_list_2, columns=['id', 'abr', 'name', 'num'])
df_states['abr'] = df_states['abr'].str.lower()
df_states.head(3)


Out[24]:
id abr name num
0 01 al Alabama 01779775
1 02 ak Alaska 01785533
2 04 az Arizona 01779777

In [25]:
# make directories (idempotent: only create the ones that are missing)
# The same "check-then-makedirs" pattern was duplicated for each
# directory; a single loop keeps the two paths in one place.
data_dir = 'data'
pums_dir = 'data/pums-h'

for d in (data_dir, pums_dir):
    if not os.path.exists(d):
        os.makedirs(d)

In [26]:
# Build the per-state download URL, e.g.
# http://www2.census.gov/programs-surveys/acs/data/pums/2014/1-Year/csv_hak.zip
base_url = 'http://www2.census.gov/programs-surveys/acs/data/pums/'
year = '2014'
middle_url = '/1-Year/csv_h'
end_url = '.zip'
df_states['url'] = df_states['abr'].apply(
    lambda abr: base_url + year + middle_url + abr + end_url)
df_states.head(3)


Out[26]:
id abr name num url
0 01 al Alabama 01779775 http://www2.census.gov/programs-surveys/acs/da...
1 02 ak Alaska 01785533 http://www2.census.gov/programs-surveys/acs/da...
2 04 az Arizona 01779777 http://www2.census.gov/programs-surveys/acs/da...

In [27]:
# Local target path for each state's zip archive
df_states['path'] = df_states['abr'].apply(
    lambda abr: 'data/pums-h/' + abr + '.zip')
df_states.head(3)


Out[27]:
id abr name num url path
0 01 al Alabama 01779775 http://www2.census.gov/programs-surveys/acs/da... data/pums-h/al.zip
1 02 ak Alaska 01785533 http://www2.census.gov/programs-surveys/acs/da... data/pums-h/ak.zip
2 04 az Arizona 01779777 http://www2.census.gov/programs-surveys/acs/da... data/pums-h/az.zip

In [28]:
# download zipped files and save to pums directory
# (iterates url/path column pairs in row order, same as iterrows)
for url, path in zip(df_states['url'], df_states['path']):
    urllib.urlretrieve(url, path)

In [29]:
%%bash
# Remove territory archives not used in the analysis: American Samoa (as),
# Northern Mariana Islands (mp), U.S. Minor Outlying Islands (um),
# Guam (gu), U.S. Virgin Islands (vi).
# rm -f: a missing file no longer aborts the whole %%bash cell with a
# nonzero exit status.
for abbr in as mp um gu vi; do
    rm -f "data/pums-h/${abbr}.zip"
done

In [30]:
# unzip files
# Each archive is extracted into its own subdirectory, e.g.
# data/pums-h/ak.zip -> data/pums-h/ak/
# Use the notebook-relative pums directory instead of an absolute
# machine-specific path (/Users/peter/...) so the notebook is portable.
rootPath = 'data/pums-h'
pattern = '*.zip'

for root, dirs, files in os.walk(rootPath):
    for filename in fnmatch.filter(files, pattern):
        archive_path = os.path.join(root, filename)
        extract_dir = os.path.join(root, os.path.splitext(filename)[0])
        # close each archive promptly instead of leaking open file handles
        with zipfile.ZipFile(archive_path) as zf:
            zf.extractall(extract_dir)

In [31]:
# Collect the per-state csv paths that were just extracted
# (one csv per state subdirectory under data/pums-h/)
base_path = 'data/pums-h/*/*.'
csv_list = glob.glob('{0}csv'.format(base_path))

csv_list[:5]


Out[31]:
['data/pums-h/ak/ss14hak.csv',
 'data/pums-h/al/ss14hal.csv',
 'data/pums-h/ar/ss14har.csv',
 'data/pums-h/az/ss14haz.csv',
 'data/pums-h/ca/ss14hca.csv']

In [32]:
# make dataframe of all the csvs
# NOTE: the loop variable is csv_path (not `csv`) so it no longer shadows
# the csv module imported at the top of the notebook; the dead initial
# `df = pd.DataFrame()` assignment is dropped since pd.concat rebinds df.
frames = []
for csv_path in csv_list:
    frames.append(pd.read_csv(csv_path, index_col=None, header=0))
df = pd.concat(frames)

In [33]:
df.head()


Out[33]:
RT SERIALNO DIVISION PUMA REGION ST ADJHSG ADJINC WGTP NP ... wgtp71 wgtp72 wgtp73 wgtp74 wgtp75 wgtp76 wgtp77 wgtp78 wgtp79 wgtp80
0 H 77 9 400 4 2 1000000 1008425 22 0 ... 24 36 24 36 6 6 20 22 22 38
1 H 315 9 200 4 2 1000000 1008425 161 5 ... 165 156 44 43 244 212 56 137 244 164
2 H 807 9 300 4 2 1000000 1008425 1303 0 ... 1909 2330 463 1339 1384 446 1416 381 1443 502
3 H 1408 9 200 4 2 1000000 1008425 50 3 ... 62 79 92 55 45 51 49 91 56 41
4 H 1508 9 101 4 2 1000000 1008425 125 4 ... 130 35 134 228 125 36 36 119 155 108

5 rows × 235 columns


In [34]:
# remove unnecessary files in pums folder
# (deletes the downloaded zips and extracted csvs now that the data lives
#  in `df`; the directory is recreated empty in the next cell)
shutil.rmtree('data/pums-h')

In [35]:
# Recreate the (now empty) pums directory so later output has a home
pums_dir = 'data/pums-h'
dir_missing = not os.path.exists(pums_dir)
if dir_missing:
    os.makedirs(pums_dir)

In [53]:
# Columns to keep, with meanings taken from the 2014 PUMS data dictionary:
# http://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMSDataDict14.pdf

cols_list = [
     'RT' # record type, H indicates house
    ,'SERIALNO' # housing unit / GQ person serial number...unique identifier
    ,'DIVISION' # division code
    ,'PUMA' # public use microdata area code, based on 2010 census defn, 100k people per puma
    ,'REGION' # region code
    ,'ST' # state code
    ,'WGTP' # housing weight
    ,'ADJINC' # adj for income & earnings to 2014 values, divide by 1 million to do conversion
    ,'ADJHSG' # adj for housing dollar amounts, divide by 1 million to do conversion
    ,'HINCP' # household income past 12 months
    ,'FINCP' # family income past 12 months
    ,'NOC' # number of own children in household (unweighted)
    ,'NRC' # number of related children in household (unweighted)
    ,'WORKSTAT' # work status
    ,'WATP' # yearly water cost
    ,'CONP' # condo fee / month
    ,'ELEP' # electricity fee / month
    ,'FULP' # fuel cost / year
    ,'GASP' # gas cost / month
    ,'GRNTP' # gross rent / month, use ADJHSG to adjust to constant dollars
    ,'GRPIP' # gross rent as % of hh income past 12 months (per data dictionary)
    ,'INSP' # fire/hazard/flood insurance amount / year
    ,'MHP' # mobile home costs / year
    ,'RNTP' # rent cost / month
    ,'SMP' # total payment on all second and junior mortgages and home equity loans / month
    ,'SMOCP' # selected monthly owner costs, use ADJHSG to adjust to constant dollars
    ,'TAXP' # property taxes per year
    ,'MRGP' # first mortgage payment / month
    ,'FS' # yearly food stamp / snap recipiency
    ,'TEN' # tenure: b n/a, 1 owned, 2 owned free & clear, 3 rented, 4 occupied w/out payment of rent
    ,'TYPE' # type of unit: 1 housing unit, 2 institutional group quarters, 3 non-institutional group quarters
    ,'BLD' # units in structure
    ,'FPARC' # family presence and age of related children
    ,'FES' # family type and employment status
    ,'KIT' # complete kitchen facilities
    ,'PLM' # complete plumbing facilities: b n/a, 1 yes, 2 no, 9 case from pr so plm n/a
    ,'MV' # when moved into this house or apartment
    ,'VEH' # number of vehicles available
]

In [54]:
# select desired columns
# Explicit .loc indexing keeps only the columns of interest, in cols_list order
df1 = df.loc[:, cols_list]
df1.head(3)


Out[54]:
RT SERIALNO DIVISION PUMA REGION ST WGTP ADJINC ADJHSG HINCP ... FS TEN TYPE BLD FPARC FES KIT PLM MV VEH
0 H 77 9 400 4 2 22 1008425 1000000 ... NaN NaN 1 2.0 NaN NaN 2.0 2.0 NaN NaN
1 H 315 9 200 4 2 161 1008425 1000000 000050660 ... 2.0 1.0 1 2.0 3.0 2.0 1.0 1.0 4.0 3.0
2 H 807 9 300 4 2 1303 1008425 1000000 ... NaN NaN 1 3.0 NaN NaN 1.0 1.0 NaN NaN

3 rows × 38 columns


In [55]:
# pickle dataframe
# Persist the trimmed frame so downstream notebooks can load it without
# re-downloading and re-extracting all the state archives.
df1.to_pickle('data/pums-h/metadata.pickle')