In [2]:
import pandas as pd

In [3]:
DATA_DIR = '/Users/joe/Dropbox/SFI_CensusData/UnitedStates/'

acs_df = pd.read_csv(DATA_DIR + '2010acs.csv')
acs_df['ID'] = acs_df.apply(lambda x: 
                            str(x['STATE_ID']).zfill(2) + str(x['COUNTY_ID']).zfill(3) + str(x['TRACT_ID']).zfill(5) + str(x['BKGP_ID']).zfill(1), axis=1)

acs_df = acs_df.set_index('ID', drop=False) # Set index but don't drop

In [4]:
acs_df.head()


Out[4]:
ID AVGHINC HINC000 HINC000_W HINC010 HINC010_W HINC015 HINC015_W HINC020 HINC020_W ... total totalmoe employed employedmoe unemployed unemployedmoe armedforces armedforcesmoe notinlaborforce notinlaborforcemoe
ID
01001201001 01001201001 83221 22 1.075274 10 0.609443 0 0.000000 11 0.789033 ... 440 121 245 96 0 119 10 21 180 74
01001201002 01001201002 96162 56 1.431417 0 0.000000 0 0.000000 9 0.337619 ... 950 147 670 116 35 28 10 15 240 88
01001202001 01001202001 50327 9 0.264817 82 3.008532 32 1.473958 12 0.518193 ... 960 228 390 138 35 35 40 38 495 130
01001202002 01001202002 47692 12 0.420951 23 1.006039 37 2.031810 0 0.000000 ... 560 162 400 130 35 33 0 119 125 64
01001203001 01001203001 60738 86 1.084880 0 0.000000 12 0.236971 22 0.407298 ... 2025 271 1230 207 40 40 35 40 720 145

5 rows × 63 columns


In [9]:
edu_vars = {'B15002_001E': 'Edu Total',
 'B15002_002E': 'Edu Male Total',
 'B15002_003E': 'Edu Male Level 0',
 'B15002_004E': 'Edu Male Level 1',
 'B15002_005E': 'Edu Male Level 2',
 'B15002_006E': 'Edu Male Level 3',
 'B15002_007E': 'Edu Male Level 4',
 'B15002_008E': 'Edu Male Level 5',
 'B15002_009E': 'Edu Male Level 6',
 'B15002_010E': 'Edu Male Level 7',
 'B15002_011E': 'Edu Male Level 8',
 'B15002_012E': 'Edu Male Level 9',
 'B15002_013E': 'Edu Male Level 10',
 'B15002_014E': 'Edu Male Level 11',
 'B15002_015E': 'Edu Male Level 12',
 'B15002_016E': 'Edu Male Level 13',
 'B15002_017E': 'Edu Male Level 14',
 'B15002_018E': 'Edu Male Level 15',
 'B15002_020E': 'Edu Female Level 0',
 'B15002_021E': 'Edu Female Level 1',
 'B15002_022E': 'Edu Female Level 2',
 'B15002_023E': 'Edu Female Level 3',
 'B15002_024E': 'Edu Female Level 4',
 'B15002_025E': 'Edu Female Level 5',
 'B15002_026E': 'Edu Female Level 6',
 'B15002_027E': 'Edu Female Level 7',
 'B15002_028E': 'Edu Female Level 8',
 'B15002_029E': 'Edu Female Level 9',
 'B15002_030E': 'Edu Female Level 10',
 'B15002_031E': 'Edu Female Level 11',
 'B15002_032E': 'Edu Female Level 12',
 'B15002_033E': 'Edu Female Level 13',
 'B15002_034E': 'Edu Female Level 14',
 'B15002_035E': 'Edu Female Level 15',
 'GEOID': 'GeoId'}

In [10]:
edu_df = pd.read_csv(DATA_DIR + '2010acs_edu.csv', encoding='utf-8', dtype={'ID':'str'})

edu_df['ID'] = edu_df.apply(lambda x: 
                             str(x['state']).zfill(2) + str(x['county']).zfill(3) + str(x['tract']).zfill(5) + str(x['block group']).zfill(1), axis=1)
edu_df = edu_df.rename(columns=edu_vars)
edu_df = edu_df.set_index('ID', drop=True) # Set index but don't drop

edu_df = edu_df.drop(['state', 'county', 'tract', 'block group', 'GeoId'], axis=1)

In [11]:
edu_df.head()


Out[11]:
Edu Total Edu Male Total Edu Male Level 0 Edu Male Level 1 Edu Male Level 2 Edu Male Level 3 Edu Male Level 4 Edu Male Level 5 Edu Male Level 6 Edu Male Level 7 ... Edu Female Level 6 Edu Female Level 7 Edu Female Level 8 Edu Female Level 9 Edu Female Level 10 Edu Female Level 11 Edu Female Level 12 Edu Female Level 13 Edu Female Level 14 Edu Female Level 15
ID
480019501001 920 388 7 0 1 24 11 5 24 30 ... 45 0 174 13 81 28 54 10 0 0
480019501002 1560 907 0 11 43 11 25 61 97 15 ... 10 12 226 20 192 37 77 6 0 3
480019501003 1251 745 0 31 19 30 0 3 17 0 ... 5 0 216 55 64 14 25 8 0 0
480019504011 6876 6843 28 66 125 571 477 634 794 64 ... 0 0 0 0 12 8 0 0 0 0
480019504021 3574 3507 28 0 23 271 223 249 351 82 ... 0 0 39 0 12 16 0 0 0 0

5 rows × 34 columns


In [12]:
df = acs_df.join(edu_df)

In [13]:
df.head()


Out[13]:
ID AVGHINC HINC000 HINC000_W HINC010 HINC010_W HINC015 HINC015_W HINC020 HINC020_W ... Edu Female Level 6 Edu Female Level 7 Edu Female Level 8 Edu Female Level 9 Edu Female Level 10 Edu Female Level 11 Edu Female Level 12 Edu Female Level 13 Edu Female Level 14 Edu Female Level 15
ID
01001201001 01001201001 83221 22 1.075274 10 0.609443 0 0.000000 11 0.789033 ... 0 0 65 27 45 7 20 73 0 0
01001201002 01001201002 96162 56 1.431417 0 0.000000 0 0.000000 9 0.337619 ... 23 0 154 0 55 52 61 0 0 0
01001202001 01001202001 50327 9 0.264817 82 3.008532 32 1.473958 12 0.518193 ... 50 11 172 0 82 11 53 19 15 0
01001202002 01001202002 47692 12 0.420951 23 1.006039 37 2.031810 0 0.000000 ... 45 0 62 7 0 31 33 0 0 8
01001203001 01001203001 60738 86 1.084880 0 0.000000 12 0.236971 22 0.407298 ... 18 52 353 46 117 62 138 23 7 0

5 rows × 97 columns


In [27]:
df = df.drop(df.filter(regex='(M|m)ale').columns, axis=1)
#df = df.drop(['ID.1', 'DKL(y|n)', 'H(y|n)'], axis=1)


df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 200120 entries, 01001201001 to 560419754006
Data columns (total 77 columns):
ID                    200120 non-null object
AVGHINC               198533 non-null float64
HINC000               200120 non-null int64
HINC000_W             198906 non-null float64
HINC010               200120 non-null int64
HINC010_W             198906 non-null float64
HINC015               200120 non-null int64
HINC015_W             198906 non-null float64
HINC020               200120 non-null int64
HINC020_W             198906 non-null float64
HINC025               200120 non-null int64
HINC025_W             198906 non-null float64
HINC030               200120 non-null int64
HINC030_W             198906 non-null float64
HINC035               200120 non-null int64
HINC035_W             198906 non-null float64
HINC040               200120 non-null int64
HINC040_W             198906 non-null float64
HINC045               200120 non-null int64
HINC045_W             198906 non-null float64
HINC050               200120 non-null int64
HINC050_W             198906 non-null float64
HINC060               200120 non-null int64
HINC060_W             198906 non-null float64
HINC075               200120 non-null int64
HINC075_W             198906 non-null float64
HINC100               200120 non-null int64
HINC100_W             198906 non-null float64
HINC125               200120 non-null int64
HINC125_W             198906 non-null float64
HINC150               200120 non-null int64
HINC150_W             198906 non-null float64
HINC200               200120 non-null int64
HINC200_W             198906 non-null float64
MEDHINC               198705 non-null float64
BKGP_ID               200120 non-null int64
CITY_NAME             200120 non-null object
CITY_ID               200120 non-null int64
COUNTY_ID             200120 non-null int64
STATE                 200120 non-null object
STATE_ID              200120 non-null int64
TOTHH                 200120 non-null int64
TOTPOP                200120 non-null int64
TRACT_ID              200120 non-null int64
geoid                 200120 non-null object
name                  200120 non-null object
state                 200120 non-null int64
county                200120 non-null int64
tract                 200120 non-null int64
blockgroup            200120 non-null int64
total                 199282 non-null float64
totalmoe              199282 non-null float64
employed              199282 non-null float64
employedmoe           199282 non-null float64
unemployed            199282 non-null float64
unemployedmoe         199282 non-null float64
armedforces           199282 non-null float64
armedforcesmoe        199282 non-null float64
notinlaborforce       199282 non-null float64
notinlaborforcemoe    199282 non-null float64
Edu Total             200120 non-null int64
Edu_Level_0           200120 non-null int64
Edu_Level_1           200120 non-null int64
Edu_Level_2           200120 non-null int64
Edu_Level_3           200120 non-null int64
Edu_Level_4           200120 non-null int64
Edu_Level_5           200120 non-null int64
Edu_Level_6           200120 non-null int64
Edu_Level_7           200120 non-null int64
Edu_Level_8           200120 non-null int64
Edu_Level_9           200120 non-null int64
Edu_Level_10          200120 non-null int64
Edu_Level_11          200120 non-null int64
Edu_Level_12          200120 non-null int64
Edu_Level_13          200120 non-null int64
Edu_Level_14          200120 non-null int64
Edu_Level_15          200120 non-null int64
dtypes: float64(28), int64(44), object(5)
memory usage: 119.1+ MB

In [28]:
df.to_csv('/Users/joe/Dropbox/SFI_CensusData/UnitedStates/2010acs_edu.csv')

In [ ]: