In [28]:
import csv
import urllib
import urllib2
import os
import pandas as pd
import zipfile
import fnmatch
import shutil
import glob
import numpy as np

In [29]:
# read pums-p data
df_pums_p = pd.read_hdf('data/pums-p/pums-p.h5', 'd1')
df_pums_p = df_pums_p#.head(500)
df_pums_p.head(3)


Out[29]:
SERIALNO ESR CIT RAC1P HISP SCHL HICOV DIS AGEP
0 315 6.0 4 1 1 19.0 1 2 30
1 315 1.0 1 1 1 20.0 1 2 34
2 315 NaN 1 1 1 3.0 1 2 7

In [30]:
df_pums_p.shape


Out[30]:
(3164116, 9)

In [31]:
def classify_from_num_col(df_input, bin_col, bin_cutoffs, bin_labels, out_col):
    """
    returns dataframe that includes new column of qualititative labels from an existing numeric column
    
    df_input is input dataframe
    bin_col is column from df_input that you want to bin
    bin_cutoffs is a list of numeric thresholds; note that number of bin_cutoffs must be one more than number of bin_labels
    bin_labels are qualitative labels
    out_col is desired name of column containing qualitative data
    """
    s_temp     =  pd.cut(df_input[bin_col], bins = bin_cutoffs, labels = bin_labels)
    df_temp    =  s_temp.to_frame(name=out_col)
    df_out     =  pd.concat((df_input, df_temp[out_col]), axis=1)
    return        df_out.drop(bin_col, axis=1)

In [32]:
# recode age, hispanic/latino, and disabled columns
df_pums_p1 = classify_from_num_col(df_pums_p,  'AGEP', [-1, 17, 65, 99], ['child', 'adult', 'senior'], 'age')
df_pums_p2 = classify_from_num_col(df_pums_p1, 'HISP', [-1, 1, 2],       [0, 1], 'l_h')
df_pums_p3 = classify_from_num_col(df_pums_p2, 'DIS',  [-1, 1, 2],       [1, 0], 'dis')

In [33]:
# recode high school diploma column
df_pums_p3['hs_diploma'] = 'child'
df_pums_p3.ix[((df_pums_p3.age =='adult') & (df_pums_p3.SCHL <=17)), 'hs_diploma'] = 'none'
df_pums_p3.ix[((df_pums_p3.age =='adult') & (df_pums_p3.SCHL >17)),  'hs_diploma'] = 'diploma'
df_pums_p3.drop('SCHL', axis=1, inplace=True)

In [34]:
# recode race: w=white alone, b=black or aa alone, na=native american alone, a=asian alone, o=other or more than one race
df_pums_p3['race'] = 'w'
df_pums_p3.ix[(df_pums_p3.RAC1P == 2), 'race'] = 'b'
df_pums_p3.ix[(df_pums_p3.RAC1P == 3), 'race'] = 'na'
df_pums_p3.ix[(df_pums_p3.RAC1P == 4), 'race'] = 'na'
df_pums_p3.ix[(df_pums_p3.RAC1P == 5), 'race'] = 'na'
df_pums_p3.ix[(df_pums_p3.RAC1P == 6), 'race'] = 'a'
df_pums_p3.ix[(df_pums_p3.RAC1P == 7), 'race'] = 'na'
df_pums_p3.ix[(df_pums_p3.RAC1P == 8), 'race'] = 'o'
df_pums_p3.ix[(df_pums_p3.RAC1P == 9), 'race'] = 'o'
df_pums_p3.drop('RAC1P', axis=1, inplace=True)

In [35]:
# recode employment status
df_pums_p3['employ'] = 'child'
df_pums_p3.ix[(df_pums_p3.ESR == 1), 'employ'] = 'employed'
df_pums_p3.ix[(df_pums_p3.ESR == 2), 'employ'] = 'employed'
df_pums_p3.ix[(df_pums_p3.ESR == 3), 'employ'] = 'unemployed'
df_pums_p3.ix[(df_pums_p3.ESR == 4), 'employ'] = 'employed'
df_pums_p3.ix[(df_pums_p3.ESR == 5), 'employ'] = 'employed'
df_pums_p3.ix[(df_pums_p3.ESR == 6), 'employ'] = 'not in lf'
df_pums_p3.drop('ESR', axis=1, inplace=True)

In [36]:
# recode health care coverage
df_pums_p3.rename(columns={'HICOV':'hi'}, inplace=True)
df_pums_p3.ix[(df_pums_p3.hi == 2), 'hi'] = 0
df_pums_p3.head(3)


Out[36]:
SERIALNO CIT hi age l_h dis hs_diploma race employ
0 315 4 1 adult 0 0 diploma w not in lf
1 315 1 1 adult 0 0 diploma w employed
2 315 1 1 child 0 0 child w child

In [37]:
# recode citizenship
df_pums_p3.rename(columns={'CIT':'cit'}, inplace=True)
df_pums_p3.ix[(df_pums_p3.cit <= 4), 'cit'] = 1
df_pums_p3.ix[(df_pums_p3.cit == 5), 'cit'] = 0
df_pums_p3.head(10)


Out[37]:
SERIALNO cit hi age l_h dis hs_diploma race employ
0 315 1 1 adult 0 0 diploma w not in lf
1 315 1 1 adult 0 0 diploma w employed
2 315 1 1 child 0 0 child w child
3 315 1 1 child 0 0 child w child
4 315 1 1 child 0 0 child w child
5 1408 1 1 adult 0 0 diploma w not in lf
6 1408 1 1 adult 0 0 diploma w employed
7 1408 1 0 adult 0 0 diploma w employed
8 1508 1 1 adult 0 0 diploma w employed
9 1508 1 1 adult 0 0 diploma w employed

In [38]:
def binary_to_grp(df, gp_primary_col, gp_secondary_col, dummy_col_name):
    """
    returns percent of x within each household as new column, grouped by household
    make sure dummy_col_name is not the same as col_name and is an actual column of dataframe
    """
    gp = df.groupby([gp_primary_col,gp_secondary_col],as_index=True).count()[dummy_col_name].unstack().fillna(0)
    gp[gp_secondary_col] = gp[1] / (gp[0] + gp[1])
    cols = [0,1]
    return gp.drop(gp[cols], axis=1)

In [39]:
gp_lh = binary_to_grp(df_pums_p3, 'SERIALNO', 'l_h', 'cit') # get percent latino / hispanic

In [40]:
gp_cit = binary_to_grp(df_pums_p3, 'SERIALNO', 'cit', 'hi') # get percent citizens in hh

In [41]:
gp_hi = binary_to_grp(df_pums_p3, 'SERIALNO', 'hi', 'cit') # get percent w/ health ins in hh

In [42]:
gp_dis = binary_to_grp(df_pums_p3, 'SERIALNO', 'dis', 'cit') # get percent disabled

In [43]:
# total
gp_total = pd.DataFrame(data = (df_pums_p3.groupby('SERIALNO', as_index=True).count().ix[:,0]))
gp_total.rename(columns={'cit':'total'}, inplace=True)

In [44]:
# age
gp_age = df_pums_p3.groupby(['SERIALNO','age'],as_index=True).count()['cit'].unstack().fillna(0) # adults, children, seniors
gp_age['child'] = gp_age['child'] / gp_total['total']
gp_age['adult'] = gp_age['adult'] / gp_total['total']
gp_age['senior'] = gp_age['senior'] / gp_total['total']

In [45]:
# high school diplomas
gp_hs = df_pums_p3.groupby(['SERIALNO','hs_diploma'],as_index=True).count()['cit'].unstack().fillna(0)
gp_hs['hs'] = gp_hs['diploma'] / (gp_hs['diploma'] + gp_hs['none'])
gp_hs.drop(['child', 'diploma', 'none'], axis=1, inplace=True)

In [46]:
# employment
gp_emp = df_pums_p3.groupby(['SERIALNO','employ'],as_index=True).count()['cit'].unstack().fillna(0)
gp_emp['employ'] = gp_emp['employed'] / (gp_emp['employed'] + gp_emp['unemployed']).fillna(0)
gp_emp.drop(['child', 'employed', 'not in lf', 'unemployed'], axis=1, inplace=True)

In [47]:
# race
gp_race = df_pums_p3.groupby(['SERIALNO','race'],as_index=True).count()['cit'].unstack().fillna(0) # white, black, native american / pacific islander / american indian, asian, other
gp_race['a'] = gp_race['a']    /    gp_total['total']
gp_race['b'] = gp_race['b']    /    gp_total['total']
gp_race['na'] = gp_race['na']  /    gp_total['total']
gp_race['o'] = gp_race['o']    /    gp_total['total']
gp_race['w'] = gp_race['w']    /    gp_total['total']

In [48]:
# join tables
pums_p = pd.concat((gp_total, gp_age, gp_cit, gp_dis, gp_emp, gp_hi, gp_hs, gp_lh, gp_race), axis=1)
pums_p.shape


Out[48]:
(1376605, 15)

In [49]:
pums_p = pums_p.fillna(0)
pums_p.head()


Out[49]:
total child adult senior cit dis employ hi hs l_h a b na o w
SERIALNO
2 5 0.400000 0.600000 0.000000 1.0 0.0 1.0 0.800000 0.666667 0.0 0.0 0.0 0.0 0.0 1.0
3 3 0.333333 0.333333 0.333333 1.0 0.0 1.0 0.666667 0.000000 0.0 0.0 0.0 0.0 0.0 1.0
4 1 0.000000 1.000000 0.000000 1.0 0.0 0.0 1.000000 1.000000 0.0 0.0 1.0 0.0 0.0 0.0
5 2 0.000000 0.500000 0.500000 1.0 0.0 1.0 1.000000 1.000000 1.0 0.0 0.0 0.0 0.0 1.0
6 1 0.000000 1.000000 0.000000 1.0 0.0 1.0 1.000000 1.000000 0.0 0.0 0.0 0.0 0.0 1.0

In [50]:
# h5 dataframe (pickling didn't work, perhaps because dataset too large)
hdf = pd.HDFStore('data/pums-p/pums-p2.h5')
hdf.put('d1', pums_p, format='table', data_columns=True)