In [5]:
import csv
import urllib    # Python 2: provides urlretrieve
import urllib2   # Python 2: provides urlopen
import os
import pandas as pd
import zipfile
import fnmatch
import shutil
import glob

In [6]:
states_url = 'http://www2.census.gov/geo/docs/reference/state.txt'
response = urllib2.urlopen(states_url)

# the file is pipe-delimited, so give csv.reader the delimiter directly
# instead of splitting each row on '|' by hand
cr = csv.reader(response, delimiter='|')

states_list_2 = list(cr)
states_list_2.pop(0)  # drop the header row

states_list_2[:3]


Out[6]:
[['01', 'AL', 'Alabama', '01779775'],
 ['02', 'AK', 'Alaska', '01785533'],
 ['04', 'AZ', 'Arizona', '01779777']]
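For reference, pandas can parse the pipe-delimited file in a single step; a sketch (not a cell run here), with dtype=str preserving the leading zeros in the codes:

# hypothetical one-step alternative to the csv.reader loop above
df_states_direct = pd.read_csv(states_url, sep='|', dtype=str)
df_states_direct.columns = ['id', 'abr', 'name', 'num']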

In [7]:
df_states = pd.DataFrame(states_list_2)
df_states.columns = ['id', 'abr', 'name', 'num']
df_states['abr'] = df_states['abr'].str.lower()
df_states.head(3)


Out[7]:
id abr name num
0 01 al Alabama 01779775
1 02 ak Alaska 01785533
2 04 az Arizona 01779777

In [8]:
# make directories

data_dir = 'data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    
pums_dir = 'data/pums-p'
if not os.path.exists(pums_dir):
    os.makedirs(pums_dir)
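The same check-then-create pattern repeats again below (In [18]); a small helper, hypothetical and not used in these cells, would remove the duplication:

# hypothetical helper: create a directory only if it does not already exist
def ensure_dir(path):
    if not os.path.exists(path):
        os.makedirs(path)

ensure_dir('data')
ensure_dir('data/pums-p')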

In [9]:
# construct the download urls, e.g. for Alaska:
# http://www2.census.gov/programs-surveys/acs/data/pums/2014/1-Year/csv_pak.zip
base_url = 'http://www2.census.gov/programs-surveys/acs/data/pums/'
year = '2014'
middle_url = '/1-Year/csv_p'
end_url = '.zip'
df_states['url'] = base_url + year + middle_url + df_states['abr'] + end_url
df_states.head(3)


Out[9]:
id abr name num url
0 01 al Alabama 01779775 http://www2.census.gov/programs-surveys/acs/da...
1 02 ak Alaska 01785533 http://www2.census.gov/programs-surveys/acs/da...
2 04 az Arizona 01779777 http://www2.census.gov/programs-surveys/acs/da...
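A quick spot check (a sketch, not part of the original run) that the constructed strings follow the pattern in the comment above:

# first row is Alabama ('al'), so its url should end with csv_pal.zip
assert df_states['url'].iloc[0].endswith('csv_pal.zip')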

In [10]:
# construct the paths
df_states['path'] = 'data/pums-p/' + df_states['abr'] + '.zip'
df_states.head(3)


Out[10]:
id abr name num url path
0 01 al Alabama 01779775 http://www2.census.gov/programs-surveys/acs/da... data/pums-p/al.zip
1 02 ak Alaska 01785533 http://www2.census.gov/programs-surveys/acs/da... data/pums-p/ak.zip
2 04 az Arizona 01779777 http://www2.census.gov/programs-surveys/acs/da... data/pums-p/az.zip

In [11]:
# download zipped files and save to pums directory
for index, row in df_states.iterrows():
    urllib.urlretrieve(row['url'], row['path'])
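In Python 2, urllib.urlretrieve saves whatever the server returns, including error pages for files that do not exist. A more defensive sketch (assuming the same df_states columns) skips files already on disk and surfaces HTTP errors via urllib2, which raises HTTPError on a 404:

# sketch: skip existing files and report http errors instead of saving them
for index, row in df_states.iterrows():
    if os.path.exists(row['path']):
        continue
    try:
        response = urllib2.urlopen(row['url'])
        with open(row['path'], 'wb') as f:
            f.write(response.read())
    except urllib2.HTTPError as e:
        print('skipping %s (HTTP %s)' % (row['url'], e.code))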

In [12]:
%%bash
# remove the territory files: 1-Year PUMS person data covers the states,
# DC, and Puerto Rico, so these five downloads are error pages, not zips
rm data/pums-p/as.zip
rm data/pums-p/mp.zip
rm data/pums-p/um.zip
rm data/pums-p/gu.zip
rm data/pums-p/vi.zip

In [13]:
# unzip each state's file into its own subdirectory
root_path = pums_dir  # 'data/pums-p'
pattern = '*.zip'

for root, dirs, files in os.walk(root_path):
    for filename in fnmatch.filter(files, pattern):
        zip_path = os.path.join(root, filename)
        out_dir = os.path.join(root, os.path.splitext(filename)[0])  # e.g. data/pums-p/ak
        zipfile.ZipFile(zip_path).extractall(out_dir)

In [14]:
# make a list of the extracted csv paths
csv_list = sorted(glob.glob('data/pums-p/*/*.csv'))  # sorted for a deterministic order

csv_list[:5]


Out[14]:
['data/pums-p/ak/ss14pak.csv',
 'data/pums-p/al/ss14pal.csv',
 'data/pums-p/ar/ss14par.csv',
 'data/pums-p/az/ss14paz.csv',
 'data/pums-p/ca/ss14pca.csv']

In [15]:
# make one dataframe from all the csvs
temp_list = []

for csv_path in csv_list:  # 'csv_path' rather than 'csv', to avoid shadowing the csv module
    dataframe = pd.read_csv(csv_path, index_col=None, header=0)
    temp_list.append(dataframe)
df = pd.concat(temp_list)


/Users/peter/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2902: DtypeWarning: Columns (95,104,126) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
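The DtypeWarning means pandas inferred different types for a few columns in different chunks of the file. A sketch of the same load with the warning's own suggestion (low_memory=False) applied:

# sketch: read each file in one pass so pandas infers a single dtype per column
temp_list = [pd.read_csv(p, index_col=None, header=0, low_memory=False)
             for p in csv_list]
df = pd.concat(temp_list)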

In [16]:
df.head()


Out[16]:
RT SERIALNO SPORDER PUMA ST ADJINC PWGTP AGEP CIT CITWP ... pwgtp71 pwgtp72 pwgtp73 pwgtp74 pwgtp75 pwgtp76 pwgtp77 pwgtp78 pwgtp79 pwgtp80
0 P 315 1 200 2 1008425 161 30 4 2008 ... 165 156 44 43 243 213 56 138 243 163
1 P 315 2 200 2 1008425 155 34 1 ... 178 141 47 45 253 227 39 134 278 144
2 P 315 3 200 2 1008425 255 7 1 ... 289 251 68 71 438 386 64 230 396 230
3 P 315 4 200 2 1008425 204 5 1 ... 222 194 56 63 350 344 60 217 322 180
4 P 315 5 200 2 1008425 180 4 1 ... 192 143 56 52 304 248 52 165 255 176

5 rows × 284 columns


In [17]:
# remove the zips and extracted csvs now that everything is loaded into df
shutil.rmtree('data/pums-p')

In [18]:
# recreate the (now empty) pums directory to hold the hdf5 store
pums_dir = 'data/pums-p'
if not os.path.exists(pums_dir):
    os.makedirs(pums_dir)

In [19]:
# http://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMSDataDict14.pdf

cols_list = [
    #'RT' # record type, H indicates house
    'SERIALNO' # housing unit / GQ person serial number...unique identifier
    #,'SPORDER' # person order
    #,'PUMA' # public use microdata area code, based on the 2010 census definition, ~100k people per puma
    #,'ST' # state code
    #,'PWGTP' # person weight
    #,'ADJINC' # adj for income & earnings to 2014 values, divide by 1 million to do conversion
    ,'ESR' # employment status recode: b n/a (less than 16 years old), 1 civilian employed at work, 2 civilian employed with a job but not at work, 3 unemployed, 4 armed forces at work, 5 armed forces with a job but not at work, 6 not in labor force
    ,'CIT' # citizenship status
    ,'RAC1P' # recoded detailed race code: 1 white alone, 2 black or aa alone, 3 american indian alone, 4 alaska native alone, 5 american indian / alaska native tribe specified or not, 6 asian alone, 7 native hawaiian / pacific islander alone, 8 some other race alone, 9 two or more races
    ,'HISP' # hispanic origin recode: 1 not hispanic, all other codes hispanic
    ,'SCHL' # educational attainment
    ,'HICOV' # health insurance coverage recode
    #,'ESP' # employment status of parents
    ,'DIS' # disability recode: 1 disability, 2 no disability
    ,'AGEP' # age: 00 is under one year, max of 99
]
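The numeric codes are opaque downstream; a hypothetical lookup table, built from the ESR comment above, shows one way to attach readable labels later:

# hypothetical label map for ESR, taken from the data-dictionary comment above
esr_labels = {
    1: 'civilian employed at work',
    2: 'civilian employed, with a job but not at work',
    3: 'unemployed',
    4: 'armed forces, at work',
    5: 'armed forces, with a job but not at work',
    6: 'not in labor force',
}
# usage sketch: df1['ESR'].map(esr_labels)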

In [20]:
# select desired columns
df1 = df[cols_list]
df1.head(3)


Out[20]:
SERIALNO ESR CIT RAC1P HISP SCHL HICOV DIS AGEP
0 315 6.0 4 1 1 19.0 1 2 30
1 315 1.0 1 1 1 20.0 1 2 34
2 315 NaN 1 1 1 3.0 1 2 7

In [21]:
df1.shape


Out[21]:
(3164116, 9)

In [22]:
# store as hdf5 (pickling didn't work, perhaps because the dataset is too large)
hdf = pd.HDFStore('data/pums-p/pums-p.h5')
hdf.put('d1', df1, format='table', data_columns=True)
hdf.close()  # flush and close the store
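To pull the table back out of the store in a later session, a one-line sketch:

# read the stored table back into a dataframe
df1 = pd.read_hdf('data/pums-p/pums-p.h5', 'd1')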