notebook.community

Edit and run



In [28]:

    
import csv
import urllib
import urllib2
import os
import pandas as pd
import zipfile
import fnmatch
import shutil
import glob
import numpy as np



In [29]:

    
# read pums-p data
df_pums_p = pd.read_hdf('data/pums-p/pums-p.h5', 'd1')
df_pums_p = df_pums_p#.head(500)
df_pums_p.head(3)



In [30]:

    
df_pums_p.shape









    Out[30]:





(3164116, 9)



In [31]:

    
def classify_from_num_col(df_input, bin_col, bin_cutoffs, bin_labels, out_col):
    """
    returns dataframe that includes new column of qualititative labels from an existing numeric column
    
    df_input is input dataframe
    bin_col is column from df_input that you want to bin
    bin_cutoffs is a list of numeric thresholds; note that number of bin_cutoffs must be one more than number of bin_labels
    bin_labels are qualitative labels
    out_col is desired name of column containing qualitative data
    """
    s_temp     =  pd.cut(df_input[bin_col], bins = bin_cutoffs, labels = bin_labels)
    df_temp    =  s_temp.to_frame(name=out_col)
    df_out     =  pd.concat((df_input, df_temp[out_col]), axis=1)
    return        df_out.drop(bin_col, axis=1)



In [32]:

    
# recode age, hispanic/latino, and disabled columns
df_pums_p1 = classify_from_num_col(df_pums_p,  'AGEP', [-1, 17, 65, 99], ['child', 'adult', 'senior'], 'age')
df_pums_p2 = classify_from_num_col(df_pums_p1, 'HISP', [-1, 1, 2],       [0, 1], 'l_h')
df_pums_p3 = classify_from_num_col(df_pums_p2, 'DIS',  [-1, 1, 2],       [1, 0], 'dis')



In [33]:

    
# recode high school diploma column
df_pums_p3['hs_diploma'] = 'child'
df_pums_p3.ix[((df_pums_p3.age =='adult') & (df_pums_p3.SCHL <=17)), 'hs_diploma'] = 'none'
df_pums_p3.ix[((df_pums_p3.age =='adult') & (df_pums_p3.SCHL >17)),  'hs_diploma'] = 'diploma'
df_pums_p3.drop('SCHL', axis=1, inplace=True)



In [34]:

    
# recode race: w=white alone, b=black or aa alone, na=native american alone, a=asian alone, o=other or more than one race
df_pums_p3['race'] = 'w'
df_pums_p3.ix[(df_pums_p3.RAC1P == 2), 'race'] = 'b'
df_pums_p3.ix[(df_pums_p3.RAC1P == 3), 'race'] = 'na'
df_pums_p3.ix[(df_pums_p3.RAC1P == 4), 'race'] = 'na'
df_pums_p3.ix[(df_pums_p3.RAC1P == 5), 'race'] = 'na'
df_pums_p3.ix[(df_pums_p3.RAC1P == 6), 'race'] = 'a'
df_pums_p3.ix[(df_pums_p3.RAC1P == 7), 'race'] = 'na'
df_pums_p3.ix[(df_pums_p3.RAC1P == 8), 'race'] = 'o'
df_pums_p3.ix[(df_pums_p3.RAC1P == 9), 'race'] = 'o'
df_pums_p3.drop('RAC1P', axis=1, inplace=True)



In [35]:

    
# recode employment status
df_pums_p3['employ'] = 'child'
df_pums_p3.ix[(df_pums_p3.ESR == 1), 'employ'] = 'employed'
df_pums_p3.ix[(df_pums_p3.ESR == 2), 'employ'] = 'employed'
df_pums_p3.ix[(df_pums_p3.ESR == 3), 'employ'] = 'unemployed'
df_pums_p3.ix[(df_pums_p3.ESR == 4), 'employ'] = 'employed'
df_pums_p3.ix[(df_pums_p3.ESR == 5), 'employ'] = 'employed'
df_pums_p3.ix[(df_pums_p3.ESR == 6), 'employ'] = 'not in lf'
df_pums_p3.drop('ESR', axis=1, inplace=True)



In [36]:

    
# recode health care coverage
df_pums_p3.rename(columns={'HICOV':'hi'}, inplace=True)
df_pums_p3.ix[(df_pums_p3.hi == 2), 'hi'] = 0
df_pums_p3.head(3)









    Out[36]:






  
    
      
      SERIALNO
      CIT
      hi
      age
      l_h
      dis
      hs_diploma
      race
      employ
    
  
  
    
      0
      315
      4
      1
      adult
      0
      0
      diploma
      w
      not in lf
    
    
      1
      315
      1
      1
      adult
      0
      0
      diploma
      w
      employed
    
    
      2
      315
      1
      1
      child
      0
      0
      child
      w
      child



In [37]:

    
# recode citizenship
df_pums_p3.rename(columns={'CIT':'cit'}, inplace=True)
df_pums_p3.ix[(df_pums_p3.cit <= 4), 'cit'] = 1
df_pums_p3.ix[(df_pums_p3.cit == 5), 'cit'] = 0
df_pums_p3.head(10)



In [38]:

    
def binary_to_grp(df, gp_primary_col, gp_secondary_col, dummy_col_name):
    """
    returns percent of x within each household as new column, grouped by household
    make sure dummy_col_name is not the same as col_name and is an actual column of dataframe
    """
    gp = df.groupby([gp_primary_col,gp_secondary_col],as_index=True).count()[dummy_col_name].unstack().fillna(0)
    gp[gp_secondary_col] = gp[1] / (gp[0] + gp[1])
    cols = [0,1]
    return gp.drop(gp[cols], axis=1)



In [39]:

    
gp_lh = binary_to_grp(df_pums_p3, 'SERIALNO', 'l_h', 'cit') # get percent latino / hispanic



In [40]:

    
gp_cit = binary_to_grp(df_pums_p3, 'SERIALNO', 'cit', 'hi') # get percent citizens in hh



In [41]:

    
gp_hi = binary_to_grp(df_pums_p3, 'SERIALNO', 'hi', 'cit') # get percent w/ health ins in hh



In [42]:

    
gp_dis = binary_to_grp(df_pums_p3, 'SERIALNO', 'dis', 'cit') # get percent disabled



In [43]:

    
# total
gp_total = pd.DataFrame(data = (df_pums_p3.groupby('SERIALNO', as_index=True).count().ix[:,0]))
gp_total.rename(columns={'cit':'total'}, inplace=True)



In [44]:

    
# age
gp_age = df_pums_p3.groupby(['SERIALNO','age'],as_index=True).count()['cit'].unstack().fillna(0) # adults, children, seniors
gp_age['child'] = gp_age['child'] / gp_total['total']
gp_age['adult'] = gp_age['adult'] / gp_total['total']
gp_age['senior'] = gp_age['senior'] / gp_total['total']



In [45]:

    
# high school diplomas
gp_hs = df_pums_p3.groupby(['SERIALNO','hs_diploma'],as_index=True).count()['cit'].unstack().fillna(0)
gp_hs['hs'] = gp_hs['diploma'] / (gp_hs['diploma'] + gp_hs['none'])
gp_hs.drop(['child', 'diploma', 'none'], axis=1, inplace=True)



In [46]:

    
# employment
gp_emp = df_pums_p3.groupby(['SERIALNO','employ'],as_index=True).count()['cit'].unstack().fillna(0)
gp_emp['employ'] = gp_emp['employed'] / (gp_emp['employed'] + gp_emp['unemployed']).fillna(0)
gp_emp.drop(['child', 'employed', 'not in lf', 'unemployed'], axis=1, inplace=True)



In [47]:

    
# race
gp_race = df_pums_p3.groupby(['SERIALNO','race'],as_index=True).count()['cit'].unstack().fillna(0) # white, black, native american / pacific islander / american indian, asian, other
gp_race['a'] = gp_race['a']    /    gp_total['total']
gp_race['b'] = gp_race['b']    /    gp_total['total']
gp_race['na'] = gp_race['na']  /    gp_total['total']
gp_race['o'] = gp_race['o']    /    gp_total['total']
gp_race['w'] = gp_race['w']    /    gp_total['total']



In [48]:

    
# join tables
pums_p = pd.concat((gp_total, gp_age, gp_cit, gp_dis, gp_emp, gp_hi, gp_hs, gp_lh, gp_race), axis=1)
pums_p.shape









    Out[48]:





(1376605, 15)



In [49]:

    
pums_p = pums_p.fillna(0)
pums_p.head()



In [50]:

    
# h5 dataframe (pickling didn't work, perhaps because dataset too large)
hdf = pd.HDFStore('data/pums-p/pums-p2.h5')
hdf.put('d1', pums_p, format='table', data_columns=True)

	total	child	adult	senior	cit	dis	employ	hi	hs	l_h	a	b	na	o	w
SERIALNO
2	5	0.400000	0.600000	0.000000	1.0	0.0	1.0	0.800000	0.666667	0.0	0.0	0.0	0.0	0.0	1.0
3	3	0.333333	0.333333	0.333333	1.0	0.0	1.0	0.666667	0.000000	0.0	0.0	0.0	0.0	0.0	1.0
4	1	0.000000	1.000000	0.000000	1.0	0.0	0.0	1.000000	1.000000	0.0	0.0	1.0	0.0	0.0	0.0
5	2	0.000000	0.500000	0.500000	1.0	0.0	1.0	1.000000	1.000000	1.0	0.0	0.0	0.0	0.0	1.0
6	1	0.000000	1.000000	0.000000	1.0	0.0	1.0	1.000000	1.000000	0.0	0.0	0.0	0.0	0.0	1.0

	SERIALNO	CIT	hi	age	hs_diploma	race	employ
0	315	4	1	adult	diploma	w	not in lf
1	315	1	1	adult	diploma	w	employed
2	315	1	1	child	child	w	child