In [28]:
import csv
import urllib
import urllib2
import os
import pandas as pd
import zipfile
import fnmatch
import shutil
import glob
import numpy as np
In [29]:
# read pums-p person-level data from the HDF5 store
df_pums_p = pd.read_hdf('data/pums-p/pums-p.h5', 'd1')
# for faster development iteration, subsample here, e.g.:
# df_pums_p = df_pums_p.head(500)
df_pums_p.head(3)
Out[29]:
In [30]:
# dimensions (rows, columns) of the full person-level table
df_pums_p.shape
Out[30]:
In [31]:
def classify_from_num_col(df_input, bin_col, bin_cutoffs, bin_labels, out_col):
    """
    Bin a numeric column into qualitative categories.

    Parameters
    ----------
    df_input : pd.DataFrame
        Input dataframe (not modified).
    bin_col : str
        Name of the numeric column to bin; it is dropped from the result.
    bin_cutoffs : list of numbers
        Bin edges for pd.cut; must contain exactly one more element than bin_labels.
    bin_labels : list
        Qualitative label for each bin.
    out_col : str
        Name of the new categorical column appended to the result.

    Returns
    -------
    pd.DataFrame
        Copy of df_input with bin_col replaced by the categorical out_col.
    """
    binned = pd.cut(df_input[bin_col], bins=bin_cutoffs, labels=bin_labels)
    labeled = df_input.copy()
    labeled[out_col] = binned
    return labeled.drop(bin_col, axis=1)
In [32]:
# recode age, hispanic/latino, and disabled columns
# age: AGEP binned into child (0-17), adult (18-65), senior (66-99)
df_pums_p1 = classify_from_num_col(df_pums_p, 'AGEP', [-1, 17, 65, 99], ['child', 'adult', 'senior'], 'age')
# l_h: HISP == 1 -> 0, HISP == 2 -> 1
# NOTE(review): PUMS HISP codes appear to run beyond 2 (other Hispanic origins);
# values above the last cutoff become NaN under pd.cut — confirm [-1, 1, 2] is intended
df_pums_p2 = classify_from_num_col(df_pums_p1, 'HISP', [-1, 1, 2], [0, 1], 'l_h')
# dis: DIS == 1 -> 1, DIS == 2 -> 0
df_pums_p3 = classify_from_num_col(df_pums_p2, 'DIS', [-1, 1, 2], [1, 0], 'dis')
In [33]:
# recode high school diploma column
# (fix: .ix was deprecated in pandas 0.20 and removed in 1.0 — use label-based .loc)
df_pums_p3['hs_diploma'] = 'child'
df_pums_p3.loc[(df_pums_p3.age == 'adult') & (df_pums_p3.SCHL <= 17), 'hs_diploma'] = 'none'
df_pums_p3.loc[(df_pums_p3.age == 'adult') & (df_pums_p3.SCHL > 17), 'hs_diploma'] = 'diploma'
# NOTE(review): seniors keep the 'child' label here and are therefore excluded
# from the diploma share computed later — confirm this is intended
df_pums_p3.drop('SCHL', axis=1, inplace=True)
In [34]:
# recode race: w=white alone, b=black or aa alone, na=native american alone, a=asian alone, o=other or more than one race
# (fix: .ix removed in pandas 1.0 — use .loc; equal-valued codes collapsed with .isin)
df_pums_p3['race'] = 'w'
df_pums_p3.loc[df_pums_p3.RAC1P == 2, 'race'] = 'b'
df_pums_p3.loc[df_pums_p3.RAC1P.isin([3, 4, 5, 7]), 'race'] = 'na'
df_pums_p3.loc[df_pums_p3.RAC1P == 6, 'race'] = 'a'
df_pums_p3.loc[df_pums_p3.RAC1P.isin([8, 9]), 'race'] = 'o'
df_pums_p3.drop('RAC1P', axis=1, inplace=True)
In [35]:
# recode employment status
# (fix: .ix removed in pandas 1.0 — use .loc; the ESR conditions are mutually
# exclusive, so collapsing equal-valued codes with .isin preserves the result)
df_pums_p3['employ'] = 'child'
df_pums_p3.loc[df_pums_p3.ESR.isin([1, 2, 4, 5]), 'employ'] = 'employed'
df_pums_p3.loc[df_pums_p3.ESR == 3, 'employ'] = 'unemployed'
df_pums_p3.loc[df_pums_p3.ESR == 6, 'employ'] = 'not in lf'
df_pums_p3.drop('ESR', axis=1, inplace=True)
In [36]:
# recode health care coverage: HICOV 1 -> 1 (covered), 2 -> 0 (not covered)
# (fix: .ix removed in pandas 1.0 — use .loc)
df_pums_p3.rename(columns={'HICOV':'hi'}, inplace=True)
df_pums_p3.loc[df_pums_p3.hi == 2, 'hi'] = 0
df_pums_p3.head(3)
Out[36]:
In [37]:
# recode citizenship: CIT 1-4 -> 1 (citizen), 5 -> 0 (not a citizen)
# (fix: .ix removed in pandas 1.0 — use .loc; the two conditions are disjoint,
# so the order of the assignments does not matter)
df_pums_p3.rename(columns={'CIT':'cit'}, inplace=True)
df_pums_p3.loc[df_pums_p3.cit <= 4, 'cit'] = 1
df_pums_p3.loc[df_pums_p3.cit == 5, 'cit'] = 0
df_pums_p3.head(10)
Out[37]:
In [38]:
def binary_to_grp(df, gp_primary_col, gp_secondary_col, dummy_col_name):
    """
    Return the within-group share of rows where gp_secondary_col == 1.

    df : input dataframe
    gp_primary_col : column to group by (e.g. household id); becomes the result index
    gp_secondary_col : binary (0/1) column whose within-group share of 1s is computed;
        the single result column carries this name
    dummy_col_name : any other column of df used only so .count() has something
        to count; must not equal gp_secondary_col and must be an actual column
    Groups with neither 0s nor 1s yield NaN (0/0 division).
    """
    gp = df.groupby([gp_primary_col, gp_secondary_col], as_index=True).count()[dummy_col_name].unstack().fillna(0)
    gp[gp_secondary_col] = gp[1] / (gp[0] + gp[1])
    # drop the raw 0/1 count columns, keeping only the computed share
    # (fix: the original passed gp[cols] — a DataFrame — as the drop labels;
    # drop expects the labels themselves)
    return gp.drop([0, 1], axis=1)
In [39]:
gp_lh = binary_to_grp(df_pums_p3, 'SERIALNO', 'l_h', 'cit') # per-household share latino/hispanic ('cit' is only the count dummy)
In [40]:
gp_cit = binary_to_grp(df_pums_p3, 'SERIALNO', 'cit', 'hi') # per-household share of citizens ('hi' is only the count dummy)
In [41]:
gp_hi = binary_to_grp(df_pums_p3, 'SERIALNO', 'hi', 'cit') # per-household share with health insurance ('cit' is only the count dummy)
In [42]:
gp_dis = binary_to_grp(df_pums_p3, 'SERIALNO', 'dis', 'cit') # per-household share disabled ('cit' is only the count dummy)
In [43]:
# total household size: count of persons per SERIALNO
# (fix: positional .ix[:, 0] was removed in pandas 1.0 — use .iloc[:, 0])
gp_total = pd.DataFrame(data = (df_pums_p3.groupby('SERIALNO', as_index=True).count().iloc[:, 0]))
# NOTE(review): the rename assumes the first counted column is 'cit' — confirm
# the column order of df_pums_p3 at this point
gp_total.rename(columns={'cit':'total'}, inplace=True)
In [44]:
# age composition: fraction of each household that is child / adult / senior
gp_age = df_pums_p3.groupby(['SERIALNO','age'],as_index=True).count()['cit'].unstack().fillna(0)
for age_grp in ['child', 'adult', 'senior']:
    # counts divided by household size (index-aligned on SERIALNO)
    gp_age[age_grp] = gp_age[age_grp] / gp_total['total']
In [45]:
# high school diplomas: share of a household's adults holding a diploma
gp_hs = df_pums_p3.groupby(['SERIALNO','hs_diploma'],as_index=True).count()['cit'].unstack().fillna(0)
# denominator is adults only ('child'-labeled members are excluded);
# households with no adults yield NaN (0/0), filled with 0 later
hs_adults = gp_hs['diploma'] + gp_hs['none']
gp_hs['hs'] = gp_hs['diploma'] / hs_adults
gp_hs.drop(['child', 'diploma', 'none'], axis=1, inplace=True)
In [46]:
# employment: share of the household labor force that is employed
gp_emp = df_pums_p3.groupby(['SERIALNO','employ'],as_index=True).count()['cit'].unstack().fillna(0)
# households with no labor-force members yield NaN (0/0), filled with 0 later
# (fix: the original chained .fillna(0) onto the denominator, where it was a
# no-op since gp_emp was already filled — removed to avoid misleading readers)
gp_emp['employ'] = gp_emp['employed'] / (gp_emp['employed'] + gp_emp['unemployed'])
gp_emp.drop(['child', 'employed', 'not in lf', 'unemployed'], axis=1, inplace=True)
In [47]:
# race composition: fraction of each household in each race category
# (w=white, b=black, na=native american / pacific islander / american indian, a=asian, o=other)
gp_race = df_pums_p3.groupby(['SERIALNO','race'],as_index=True).count()['cit'].unstack().fillna(0)
for race_grp in ['a', 'b', 'na', 'o', 'w']:
    # counts divided by household size (index-aligned on SERIALNO)
    gp_race[race_grp] = gp_race[race_grp] / gp_total['total']
In [48]:
# assemble all household-level feature tables into one frame, aligned on the SERIALNO index
feature_tables = (gp_total, gp_age, gp_cit, gp_dis, gp_emp, gp_hi, gp_hs, gp_lh, gp_race)
pums_p = pd.concat(feature_tables, axis=1)
pums_p.shape
Out[48]:
In [49]:
# replace NaN ratios (e.g. from 0/0 divisions in the share columns) with 0
pums_p = pums_p.fillna(0)
pums_p.head()
Out[49]:
In [50]:
# persist the household-level table as HDF5 (pickling didn't work, perhaps because dataset too large)
hdf = pd.HDFStore('data/pums-p/pums-p2.h5')
hdf.put('d1', pums_p, format='table', data_columns=True)
hdf.close()  # fix: flush and release the file handle (the store was never closed)