In [1]:
%load_ext autoreload
%autoreload 2
from synthpop.census_helpers import Census
from synthpop import categorizer as cat
import pandas as pd
import numpy as np
import os
pd.set_option('display.max_columns', 500)

The census api needs a key - you can register for can sign up


In [2]:
c = Census(os.environ["CENSUS"])

Here we get aggregate information on households from ACS - note some variables are associated with block groups and others with tracts


In [3]:
income_columns = ['B19001_0%02dE'%i for i in range(1, 18)]
vehicle_columns = ['B08201_0%02dE'%i for i in range(1, 7)]
workers_columns = ['B08202_0%02dE'%i for i in range(1, 6)]
families_columns = ['B11001_001E', 'B11001_002E']
block_group_columns = income_columns + families_columns
tract_columns = vehicle_columns + workers_columns
h_acs = c.block_group_and_tract_query(block_group_columns,
                tract_columns, "06", "075", 
                merge_columns=['tract', 'county', 'state'],
                block_group_size_attr="B11001_001E",
                tract_size_attr="B08201_001E",
                tract="030600")
h_acs


Out[3]:
B11001_001E B11001_002E B19001_001E B19001_002E B19001_003E B19001_004E B19001_005E B19001_006E B19001_007E B19001_008E B19001_009E B19001_010E B19001_011E B19001_012E B19001_013E B19001_014E B19001_015E B19001_016E B19001_017E NAME block group county state tract B08201_001E B08201_002E B08201_003E B08201_004E B08201_005E B08201_006E B08202_001E B08202_002E B08202_003E B08202_004E B08202_005E
0 300 205 300 7 0 10 18 34 0 5 13 0 20 9 27 44 4 35 74 Block Group 1, Census Tract 306, San Francisco... 1 075 06 030600 300 9 91 152 42 4 300 68 119 101 10
1 273 194 273 0 0 0 13 0 0 15 6 0 0 6 35 14 36 64 84 Block Group 2, Census Tract 306, San Francisco... 2 075 06 030600 273 9 83 138 38 3 273 62 109 92 9
2 305 240 305 0 0 10 5 4 0 0 0 0 6 12 11 24 18 39 176 Block Group 3, Census Tract 306, San Francisco... 3 075 06 030600 305 10 93 154 43 4 305 69 121 102 10

And here is aggregate information on people from ACS


In [4]:
population = ['B01001_001E']
sex = ['B01001_002E', 'B01001_026E']
race = ['B02001_0%02dE'%i for i in range(1,11)]
male_age_columns = ['B01001_0%02dE'%i for i in range(3,26)]
female_age_columns = ['B01001_0%02dE'%i for i in range(27,50)]
all_columns = population + sex + race + male_age_columns + female_age_columns
p_acs = c.block_group_query(all_columns, "06", "075", tract="030600")
p_acs


Out[4]:
B01001_001E B01001_002E B01001_003E B01001_004E B01001_005E B01001_006E B01001_007E B01001_008E B01001_009E B01001_010E B01001_011E B01001_012E B01001_013E B01001_014E B01001_015E B01001_016E B01001_017E B01001_018E B01001_019E B01001_020E B01001_021E B01001_022E B01001_023E B01001_024E B01001_025E B01001_026E B01001_027E B01001_028E B01001_029E B01001_030E B01001_031E B01001_032E B01001_033E B01001_034E B01001_035E B02001_001E B02001_002E B02001_003E B02001_004E B02001_005E B02001_006E B02001_007E B02001_008E B02001_009E B02001_010E NAME block group county state tract B01001_036E B01001_037E B01001_038E B01001_039E B01001_040E B01001_041E B01001_042E B01001_043E B01001_044E B01001_045E B01001_046E B01001_047E B01001_048E B01001_049E
0 789 392 56 40 24 8 0 0 0 26 27 12 4 13 18 35 13 10 20 19 29 11 27 0 0 397 0 20 16 34 0 9 11 10 21 789 367 9 0 261 0 101 51 19 32 Block Group 1, Census Tract 306, San Francisco... 1 075 06 030600 16 34 6 15 34 29 14 20 15 30 20 9 23 11
1 696 342 18 16 6 0 5 0 0 0 0 34 35 52 67 12 23 17 0 0 7 12 22 5 11 354 12 28 34 6 0 0 0 8 9 696 447 0 0 179 0 24 46 0 46 Block Group 2, Census Tract 306, San Francisco... 2 075 06 030600 31 39 21 18 20 26 26 13 6 13 23 9 6 6
2 939 518 70 51 18 30 6 0 6 8 19 5 25 28 41 54 70 6 8 2 7 34 5 9 16 421 47 33 19 17 0 0 0 11 13 939 532 12 0 310 0 0 85 0 85 Block Group 3, Census Tract 306, San Francisco... 3 075 06 030600 0 36 46 24 39 49 10 0 0 12 9 6 18 32

Get the puma for our test tracts - this actually downloads the mapping file from the census website so it might take a few seconds


In [5]:
puma = c.tract_to_puma("06", "075", "030600")
puma


Out[5]:
('07506', '02206')

In [6]:
puma10 = puma[0]
puma00 = puma[1]

Download PUMS for people records for a PUMA from our server (we processed the large files into smaller ones for you)


In [7]:
p_pums = c.download_population_pums("06", puma10=puma10, puma00=puma00)
p_pums.head(5)


C:\Users\Juan\Anaconda3\envs\synpop_py3\lib\site-packages\IPython\core\interactiveshell.py:2910: DtypeWarning: Columns (108) have mixed types. Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)
Out[7]:
serialno RT SPORDER puma00 puma10 ST ADJINC PWGTP AGEP CIT CITWP05 CITWP12 COW DDRS DEAR DEYE DOUT DPHY DRAT DRATX DREM ENG FER GCL GCM GCR HINS1 HINS2 HINS3 HINS4 HINS5 HINS6 HINS7 INTP JWMNP JWRIP JWTR LANX MAR MARHD MARHM MARHT MARHW MARHYP05 MARHYP12 MIG MIL MLPA MLPB MLPCD MLPE MLPFG MLPH MLPI MLPJ MLPK NWAB NWAV NWLA NWLK NWRE OIP PAP RELP RETP SCH SCHG SCHL SEMP SEX SSIP SSP WAGP WKHP WKL WKW WRK YOEP05 YOEP12 ANC ANC1P05 ANC1P12 ANC2P05 ANC2P12 DECADE DIS DRIVESP ESP ESR FOD1P FOD2P HICOV HISP INDP JWAP JWDP LANP05 LANP12 MIGPUMA00 MIGPUMA10 MIGSP05 MIGSP12 MSP NAICSP NATIVITY NOP OC OCCP02 OCCP10 OCCP12 PAOC PERNP PINCP POBP05 POBP12 POVPIP POWPUMA00 POWPUMA10 POWSP05 POWSP12 PRIVCOV PUBCOV QTRBIR RAC1P RAC2P05 RAC2P12 RAC3P05 RAC3P12 RACAIAN RACASN RACBLK RACNHPI RACNUM RACSOR RACWHT RC SCIENGP SCIENGRLP SFN SFR SOCP00 SOCP10 SOCP12 VPS WAOB FAGEP FANCP FCITP FCITWP FCOWP FDDRSP FDEARP FDEYEP FDOUTP FDPHYP FDRATP FDRATXP FDREMP FENGP FESRP FFERP FFODP FGCLP FGCMP FGCRP FHINS1P FHINS2P FHINS3C FHINS3P FHINS4C FHINS4P FHINS5C FHINS5P FHINS6P FHINS7P FHISP FINDP FINTP FJWDP FJWMNP FJWRIP FJWTRP FLANP FLANXP FMARHDP FMARHMP FMARHTP FMARHWP FMARHYP FMARP FMIGP FMIGSP FMILPP FMILSP FOCCP FOIP FPAP FPOBP FPOWSP FRACP FRELP FRETP FSCHGP FSCHLP FSCHP FSEMP FSEXP FSSIP FSSP FWAGP FWKHP FWKLP FWKWP FWRKP FYOEP PWGTP1 PWGTP2 PWGTP3 PWGTP4 PWGTP5 PWGTP6 PWGTP7 PWGTP8 PWGTP9 PWGTP10 PWGTP11 PWGTP12 PWGTP13 PWGTP14 PWGTP15 PWGTP16 PWGTP17 PWGTP18 PWGTP19 PWGTP20 PWGTP21 PWGTP22 PWGTP23 PWGTP24 PWGTP25 PWGTP26 PWGTP27 PWGTP28 PWGTP29 PWGTP30 PWGTP31 PWGTP32 PWGTP33 PWGTP34 PWGTP35 PWGTP36 PWGTP37 PWGTP38 PWGTP39 PWGTP40 PWGTP41 PWGTP42 PWGTP43 PWGTP44 PWGTP45 PWGTP46 PWGTP47 PWGTP48 PWGTP49 PWGTP50 PWGTP51 PWGTP52 PWGTP53 PWGTP54 PWGTP55 PWGTP56 PWGTP57 PWGTP58 PWGTP59 PWGTP60 PWGTP61 PWGTP62 PWGTP63 PWGTP64 PWGTP65 PWGTP66 PWGTP67 PWGTP68 PWGTP69 PWGTP70 PWGTP71 PWGTP72 PWGTP73 PWGTP74 PWGTP75 PWGTP76 PWGTP77 PWGTP78 PWGTP79 PWGTP80
0 2012000002680 P 1 -9 7506 6 1024887 21 64 1 NaN NaN 2.0 2.0 2 2 2.0 2.0 NaN NaN 2.0 NaN NaN 2.0 NaN NaN 1 2 2 2 2 2 2 0.0 10.0 NaN 10.0 2.0 5 NaN NaN NaN NaN NaN NaN 1.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 5.0 2.0 2.0 3.0 0.0 0.0 0 0.0 1.0 NaN 21.0 0.0 2 0.0 0.0 59000.0 24.0 1.0 1.0 1.0 NaN NaN 3 -9 995 -9 999 NaN 2 NaN NaN 1.0 3301.0 NaN 1 1 7870.0 84.0 43.0 NaN NaN NaN NaN NaN NaN 6.0 611M1 1 NaN 0 N.A. N.A. 5860 4.0 59000.0 59000.0 -9 36 500.0 -9.0 7500.0 -9.0 6.0 1 2 4 1 -9 1 -9 1 0 0 0 0 1 0 1 0 2.0 2.0 NaN NaN N.A.// N.A.// 439061 NaN 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN 0 NaN 0 NaN 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0.0 0 22 18 23 7 25 23 38 30 6 5 40 20 7 33 35 23 22 6 22 21 20 20 23 37 18 19 7 6 46 37 7 21 36 7 6 16 17 36 19 19 19 19 20 39 19 20 7 7 37 38 7 20 32 7 7 22 21 33 20 18 20 23 23 6 19 21 40 43 5 7 40 20 6 35 39 22 26 6 20 21
1 2012000009189 P 1 -9 7506 6 1024887 14 52 4 -9.0 1995.0 NaN 2.0 2 1 2.0 2.0 NaN NaN 2.0 3.0 NaN 2.0 NaN NaN 1 2 2 2 2 2 2 0.0 NaN NaN NaN 1.0 1 2.0 2.0 1.0 2.0 -9.0 1992.0 1.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 5.0 3.0 3.0 3.0 10000.0 0.0 0 0.0 1.0 NaN 16.0 0.0 1 0.0 0.0 0.0 NaN 3.0 NaN NaN -9.0 1989.0 4 -9 999 -9 999 5.0 1 NaN NaN 6.0 NaN NaN 1 1 NaN NaN NaN -9.0 708.0 NaN NaN NaN NaN 1.0 NaN 2 NaN 0 NaN NaN NaN NaN 0.0 10000.0 -9 207 43.0 NaN NaN NaN NaN 1 2 3 6 -9 43 -9 5 0 1 0 0 1 0 0 0 NaN NaN NaN NaN NaN NaN NaN NaN 4 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 0 NaN 0 NaN 0 NaN 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 1 0 1 1 1 0 1 0 0.0 0 15 5 26 18 6 10 11 5 15 4 15 17 17 20 11 21 16 12 28 26 14 28 4 4 31 15 13 27 19 25 16 14 14 4 12 3 15 12 4 3 15 4 24 27 5 14 12 4 16 3 17 14 14 26 10 27 17 15 30 20 19 28 5 4 22 14 11 26 14 21 10 16 19 4 14 5 13 14 5 4
2 2012000009189 P 2 -9 7506 6 1024887 18 45 4 -9.0 1998.0 3.0 2.0 2 1 2.0 2.0 NaN NaN 2.0 3.0 2.0 2.0 NaN NaN 1 2 2 2 2 2 2 0.0 NaN NaN NaN 1.0 1 2.0 2.0 1.0 2.0 -9.0 1992.0 1.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 5.0 3.0 3.0 3.0 0.0 0.0 1 0.0 1.0 NaN 16.0 0.0 2 0.0 0.0 0.0 NaN 2.0 NaN NaN -9.0 1992.0 4 -9 999 -9 999 6.0 1 NaN NaN 6.0 NaN NaN 1 1 8370.0 NaN NaN -9.0 708.0 NaN NaN NaN NaN 1.0 6241 2 NaN 0 N.A. N.A. 4610 2.0 0.0 0.0 -9 207 43.0 NaN NaN NaN NaN 1 2 2 6 -9 43 -9 5 0 1 0 0 1 0 0 0 NaN NaN NaN NaN N.A.// N.A.// 399021 NaN 4 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 0 0 0 0 0 NaN 0 NaN 0 NaN 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 1 0 1 1 1 0 1 0 0.0 0 18 5 36 30 7 20 19 6 23 7 15 17 30 30 16 28 20 19 28 34 18 32 7 5 38 19 19 33 18 34 18 16 19 5 18 5 17 20 5 5 23 7 25 30 5 19 21 6 18 7 21 19 20 26 18 31 18 23 30 31 19 37 6 6 36 23 18 28 17 32 15 21 22 6 24 4 15 21 5 6
3 2012000009189 P 3 -9 7506 6 1024887 14 10 1 NaN NaN NaN 2.0 2 2 NaN 2.0 NaN NaN 2.0 NaN NaN NaN NaN NaN 2 2 2 1 2 2 2 NaN NaN NaN NaN 2.0 5 NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2 NaN 1.0 NaN 7.0 NaN 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN 4 -9 999 -9 999 NaN 2 NaN 4.0 NaN NaN NaN 1 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1 4.0 1 NaN NaN NaN NaN NaN NaN -9 6 43.0 NaN NaN NaN NaN 2 1 4 6 -9 43 -9 5 0 1 0 0 1 0 0 1 NaN NaN NaN NaN NaN NaN NaN NaN 1 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 NaN 0 0.0 0 NaN 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0 15 3 22 21 4 13 15 4 17 5 11 12 17 24 16 23 14 14 24 25 17 22 5 5 25 14 14 24 11 27 13 14 13 4 15 4 16 13 4 5 14 5 28 26 4 20 13 3 14 4 17 13 13 24 12 21 14 15 22 22 16 23 4 4 23 15 15 23 14 28 15 12 17 5 12 5 13 15 5 4
4 2012000009189 P 4 -9 7506 6 1024887 14 8 1 NaN NaN NaN 2.0 2 2 NaN 2.0 NaN NaN 2.0 NaN NaN NaN NaN NaN 2 2 2 1 2 2 2 NaN NaN NaN NaN 2.0 5 NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2 NaN 2.0 5.0 6.0 NaN 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN 4 -9 999 -9 999 NaN 2 NaN 4.0 NaN NaN NaN 1 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1 4.0 1 NaN NaN NaN NaN NaN NaN -9 6 43.0 NaN NaN NaN NaN 2 1 1 6 -9 43 -9 5 0 1 0 0 1 0 0 1 NaN NaN NaN NaN NaN NaN NaN NaN 1 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 1 1 NaN 1 0.0 1 NaN 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0.0 0 15 4 22 21 4 12 16 5 17 5 11 12 17 24 16 22 15 14 24 25 17 22 4 5 25 13 14 25 12 27 13 14 13 4 14 4 17 13 4 4 14 5 28 26 3 19 13 3 15 4 16 13 13 24 12 22 14 16 21 22 15 23 4 3 22 15 15 23 14 28 15 12 17 5 13 4 13 15 4 3

Download PUMS for household records for a PUMA


In [8]:
h_pums = c.download_household_pums("06", puma10=puma10, puma00=puma00)
h_pums.head(5)


Out[8]:
serialno insp RT DIVISION puma00 puma10 REGION ST ADJHSG ADJINC WGTP NP TYPE ACR AGS BATH BDSP BLD BUS CONP ELEP FS FULP GASP HFL MHP MRGI MRGP MRGT MRGX REFR RMSP RNTM RNTP RWAT RWATPR SINK SMP STOV TEL TEN TOIL VACS VALP VEH WATP YBL FES FINCP FPARC GRNTP GRPIP HHL HHT HINCP HUGCL HUPAC HUPAOC HUPARC KIT LNGI MULTG MV NOC NPF NPP NR NRC OCPIP PARTNER PLM PSF R18 R60 R65 RESMODE SMOCP SMX SRNT SVAL TAXP WIF WKEXREL WORKSTAT FACRP FAGSP FBATHP FBDSP FBLDP FBUSP FCONP FELEP FFSP FFULP FGASP FHFLP FINSP FKITP FMHP FMRGIP FMRGP FMRGTP FMRGXP FMVP FPLMP FREFRP FRMSP FRNTMP FRNTP FRWATP FRWATPRP FSINKP FSMP FSMXHP FSMXSP FSTOVP FTAXP FTELP FTENP FTOILP FVACSP FVALP FVEHP FWATP FYBLP WGTP1 WGTP2 WGTP3 WGTP4 WGTP5 WGTP6 WGTP7 WGTP8 WGTP9 WGTP10 WGTP11 WGTP12 WGTP13 WGTP14 WGTP15 WGTP16 WGTP17 WGTP18 WGTP19 WGTP20 WGTP21 WGTP22 WGTP23 WGTP24 WGTP25 WGTP26 WGTP27 WGTP28 WGTP29 WGTP30 WGTP31 WGTP32 WGTP33 WGTP34 WGTP35 WGTP36 WGTP37 WGTP38 WGTP39 WGTP40 WGTP41 WGTP42 WGTP43 WGTP44 WGTP45 WGTP46 WGTP47 WGTP48 WGTP49 WGTP50 WGTP51 WGTP52 WGTP53 WGTP54 WGTP55 WGTP56 WGTP57 WGTP58 WGTP59 WGTP60 WGTP61 WGTP62 WGTP63 WGTP64 WGTP65 WGTP66 WGTP67 WGTP68 WGTP69 WGTP70 WGTP71 WGTP72 WGTP73 WGTP74 WGTP75 WGTP76 WGTP77 WGTP78 WGTP79 WGTP80
0 2012000002680 200.0 H 9 -9 7506 4 6 1014531 1024887 21 1 1 1.0 NaN 1.0 2.0 3.0 2.0 0.0 60.0 2.0 2.0 4.0 9.0 NaN 2.0 350.0 2.0 1.0 1.0 5.0 NaN NaN 1.0 9.0 1.0 NaN 1.0 1.0 1.0 1.0 NaN 500000.0 1.0 430.0 1.0 NaN NaN NaN NaN NaN 1.0 6.0 59000.0 0.0 4.0 4.0 4.0 1.0 1.0 1.0 6.0 0.0 NaN 0.0 0.0 0.0 13.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 634.0 3.0 0.0 1.0 32.0 NaN NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 22 18 23 7 25 23 37 30 6 5 40 20 7 33 35 23 22 6 22 21 20 20 23 37 18 19 7 6 46 36 7 21 36 8 5 17 18 36 19 18 19 19 20 38 19 20 7 6 37 38 7 20 31 7 6 22 21 32 20 18 20 23 23 7 20 20 40 43 5 7 40 20 6 35 39 22 26 6 20 22
1 2012000009189 660.0 H 9 -9 7506 4 6 1014531 1024887 14 4 1 NaN NaN 1.0 5.0 4.0 NaN 0.0 50.0 2.0 2.0 50.0 1.0 NaN 2.0 490.0 2.0 1.0 1.0 7.0 NaN NaN 1.0 9.0 1.0 630.0 1.0 1.0 1.0 1.0 NaN 40000.0 2.0 2000.0 1.0 4.0 10000.0 2.0 NaN NaN 4.0 1.0 10000.0 0.0 2.0 2.0 2.0 1.0 2.0 1.0 5.0 2.0 4.0 0.0 0.0 2.0 101.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 1534.0 2.0 0.0 0.0 23.0 0.0 9.0 9.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 15 5 26 17 5 10 12 5 15 4 14 17 17 20 11 22 16 12 27 26 15 27 4 4 32 16 13 27 18 26 17 13 15 4 12 3 15 12 4 4 15 4 24 28 5 14 11 4 17 3 17 14 14 26 10 27 17 15 31 21 18 28 6 3 22 14 11 25 14 20 10 16 19 3 14 5 13 14 5 4
2 2012000016466 NaN H 9 -9 7506 4 6 1014531 1024887 14 3 1 1.0 NaN 1.0 2.0 2.0 2.0 0.0 10.0 2.0 2.0 1.0 1.0 NaN NaN NaN NaN NaN 1.0 4.0 2.0 2000.0 1.0 9.0 1.0 NaN 1.0 1.0 3.0 1.0 NaN NaN 1.0 1.0 2.0 7.0 30000.0 2.0 2010.0 80.0 1.0 3.0 30000.0 0.0 2.0 2.0 2.0 1.0 1.0 1.0 2.0 2.0 3.0 0.0 0.0 2.0 NaN 0.0 1.0 0.0 1.0 0.0 0.0 1.0 NaN NaN 1.0 0.0 NaN 1.0 13.0 13.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4 5 25 15 4 23 23 14 15 5 15 20 17 14 4 16 16 25 25 12 24 17 4 15 24 3 6 15 17 24 13 16 14 10 20 15 15 4 5 15 4 5 29 16 4 23 25 10 15 5 11 10 13 17 5 14 16 31 20 15 27 20 5 19 32 7 4 16 11 24 12 13 13 16 23 12 11 4 4 14
3 2012000017340 500.0 H 9 -9 7506 4 6 1014531 1024887 11 4 1 1.0 NaN 1.0 3.0 3.0 2.0 0.0 100.0 2.0 1000.0 30.0 3.0 NaN 2.0 3000.0 2.0 1.0 1.0 4.0 NaN NaN 1.0 9.0 1.0 NaN 1.0 1.0 1.0 1.0 NaN 500000.0 1.0 1200.0 7.0 5.0 60000.0 4.0 NaN NaN 4.0 2.0 60000.0 0.0 4.0 4.0 4.0 1.0 1.0 1.0 4.0 0.0 4.0 0.0 0.0 0.0 79.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0 3938.0 3.0 0.0 1.0 65.0 2.0 12.0 11.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 10 15 2 9 3 8 14 11 16 9 22 10 11 23 12 5 28 21 4 23 11 11 13 9 15 9 11 12 4 10 4 11 10 4 12 15 3 4 18 3 13 7 4 12 3 13 9 11 30 12 17 12 12 18 10 3 18 21 4 13 10 8 17 12 20 10 11 10 4 10 3 12 10 2 10 16 3 4 17 2
4 2012000025664 200.0 H 9 -9 7506 4 6 1014531 1024887 85 5 1 1.0 NaN 1.0 3.0 3.0 2.0 0.0 50.0 2.0 2.0 3.0 3.0 NaN 2.0 680.0 2.0 1.0 1.0 6.0 NaN NaN 1.0 9.0 1.0 NaN 1.0 1.0 1.0 1.0 NaN 500000.0 2.0 900.0 4.0 7.0 73500.0 4.0 NaN NaN 1.0 3.0 73500.0 0.0 4.0 4.0 4.0 1.0 1.0 2.0 7.0 0.0 5.0 0.0 0.0 0.0 16.0 0.0 1.0 0.0 0.0 1.0 1.0 2.0 989.0 3.0 0.0 1.0 32.0 3.0 13.0 13.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 1.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 18 25 147 72 26 96 123 64 83 28 93 78 98 77 37 90 102 135 91 85 108 138 35 91 138 42 43 77 68 163 86 76 101 74 106 100 111 30 28 87 118 131 40 90 100 27 36 79 76 137 106 104 92 82 166 85 86 35 24 79 23 26 116 72 29 118 143 85 85 49 110 88 70 83 33 81 85 107 116 102

Now the job is to categorize acs and pums into the same categories - we start with the household acs data


In [9]:
h_acs_cat = cat.categorize(h_acs, {
    ("households", "total"): "B11001_001E",
    ("children", "yes"): "B11001_002E",
    ("children", "no"): "B11001_001E - B11001_002E",
    ("income", "lt35"): "B19001_002E + B19001_003E + B19001_004E + "
                        "B19001_005E + B19001_006E + B19001_007E",
    ("income", "gt35-lt100"): "B19001_008E + B19001_009E + "
                        "B19001_010E + B19001_011E + B19001_012E"
                        "+ B19001_013E",
    ("income", "gt100"): "B19001_014E + B19001_015E + B19001_016E"
                        "+ B19001_017E",
    ("cars", "none"): "B08201_002E",
    ("cars", "one"): "B08201_003E",
    ("cars", "two or more"): "B08201_004E + B08201_005E + B08201_006E",
    ("workers", "none"): "B08202_002E",
    ("workers", "one"): "B08202_003E",
    ("workers", "two or more"): "B08202_004E + B08202_005E" 
}, index_cols=['NAME'])
h_acs_cat


Out[9]:
cat_name cars children households income workers
cat_value none one two or more no yes total gt100 gt35-lt100 lt35 none one two or more
NAME
Block Group 1, Census Tract 306, San Francisco County, California 9 91 198 95 205 300 157 74 69 68 119 111
Block Group 2, Census Tract 306, San Francisco County, California 9 83 179 79 194 273 198 62 13 62 109 101
Block Group 3, Census Tract 306, San Francisco County, California 10 93 201 65 240 305 257 29 19 69 121 112

In [10]:
assert np.all(cat.sum_accross_category(h_acs_cat) < 2)

And the same for ACS population - the output of the categorization is the MARGINALS for each variable category


In [11]:
p_acs_cat = cat.categorize(p_acs, {
    ("population", "total"): "B01001_001E",
    ("age", "19 and under"): "B01001_003E + B01001_004E + B01001_005E + "
                             "B01001_006E + B01001_007E + B01001_027E + "
                             "B01001_028E + B01001_029E + B01001_030E + "
                             "B01001_031E",
    ("age", "20 to 35"): "B01001_008E + B01001_009E + B01001_010E + "
                         "B01001_011E + B01001_012E + B01001_032E + "
                         "B01001_033E + B01001_034E + B01001_035E + "
                         "B01001_036E",
    ("age", "35 to 60"): "B01001_013E + B01001_014E + B01001_015E + "
                         "B01001_016E + B01001_017E + B01001_037E + "
                         "B01001_038E + B01001_039E + B01001_040E + "
                         "B01001_041E",
    ("age", "above 60"): "B01001_018E + B01001_019E + B01001_020E + "
                         "B01001_021E + B01001_022E + B01001_023E + "
                         "B01001_024E + B01001_025E + B01001_042E + "
                         "B01001_043E + B01001_044E + B01001_045E + "
                         "B01001_046E + B01001_047E + B01001_048E + "
                         "B01001_049E", 
    ("race", "white"):   "B02001_002E",
    ("race", "black"):   "B02001_003E",
    ("race", "asian"):   "B02001_005E",
    ("race", "other"):   "B02001_004E + B02001_006E + B02001_007E + "
                         "B02001_008E",
    ("sex", "male"):     "B01001_002E",
    ("sex", "female"):   "B01001_026E"
}, index_cols=['NAME'])
p_acs_cat


Out[11]:
cat_name age population race sex
cat_value 19 and under 20 to 35 35 to 60 above 60 total asian black other white female male
NAME
Block Group 1, Census Tract 306, San Francisco County, California 198 132 201 258 789 261 9 152 367 397 392
Block Group 2, Census Tract 306, San Francisco County, California 125 82 313 176 696 179 0 70 447 354 342
Block Group 3, Census Tract 306, San Francisco County, California 291 62 412 174 939 310 12 85 532 421 518

In [12]:
assert np.all(cat.sum_accross_category(p_acs_cat) < 2)

To get the marginals a series for one geography do this


In [13]:
p_acs_cat.iloc[0].transpose()


Out[13]:
cat_name    cat_value   
age         19 and under    198
            20 to 35        132
            35 to 60        201
            above 60        258
population  total           789
race        asian           261
            black             9
            other           152
            white           367
sex         female          397
            male            392
Name: Block Group 1, Census Tract 306, San Francisco County, California, dtype: int32

Now categorize the PUMS population data into the same categories


In [14]:
def age_cat(r):
    if r.AGEP <= 19: return "19 and under"
    elif r.AGEP <= 35: return "20 to 35"
    elif r.AGEP <= 60: return "35 to 60"
    return "above 60"

def race_cat(r):
    if r.RAC1P == 1: return "white"
    elif r.RAC1P == 2: return "black"
    elif r.RAC1P == 6: return "asian"
    return "other"

def sex_cat(r):
    if r.SEX == 1: return "male"
    return "female"

_, jd_persons = cat.joint_distribution(
    p_pums,
    cat.category_combinations(p_acs_cat.columns),
    {"age": age_cat, "race": race_cat, "sex": sex_cat}
)
jd_persons


Out[14]:
cat_id frequency
age race sex
19 and under asian female 0 187
male 1 215
black female 2 16
male 3 15
other female 4 89
male 5 94
white female 6 202
male 7 185
20 to 35 asian female 8 238
male 9 211
black female 10 21
male 11 23
other female 12 59
male 13 67
white female 14 198
male 15 225
35 to 60 asian female 16 448
male 17 378
black female 18 33
male 19 39
other female 20 75
male 21 78
white female 22 373
male 23 453
above 60 asian female 24 233
male 25 182
black female 26 57
male 27 43
other female 28 32
male 29 25
white female 30 354
male 31 294

Do the same for households - the output of this step is the JOINT DISTRIBUTIONS for the cross product of all possible categories


In [16]:
def cars_cat(r):
    if r.VEH == 0: return "none"
    elif r.VEH == 1: return "one"
    return "two or more"

def children_cat(r):
    if r.NOC > 0: return "yes"
    return "no"

def income_cat(r):
    if r.FINCP > 100000: return "gt100"
    elif r.FINCP > 35000: return "gt35-lt100"
    return "lt35"

def workers_cat(r):
    if r.WIF == 3: return "two or more"
    elif r.WIF == 2: return "two or more"
    elif r.WIF == 1: return "one"
    return "none"

_, jd_households = cat.joint_distribution(
    h_pums,
    cat.category_combinations(h_acs_cat.columns),
    {"cars": cars_cat, "children": children_cat, 
     "income": income_cat, "workers": workers_cat}
)
jd_households


Out[16]:
cat_id frequency
cars children income workers
none no gt100 none 0 0.0
one 1 3.0
two or more 2 5.0
gt35-lt100 none 3 3.0
one 4 6.0
two or more 5 10.0
lt35 none 6 174.0
one 7 9.0
two or more 8 1.0
yes gt100 none 9 0.0
one 10 0.0
two or more 11 1.0
gt35-lt100 none 12 0.0
one 13 4.0
two or more 14 2.0
lt35 none 15 2.0
one 16 5.0
two or more 17 0.0
one no gt100 none 18 10.0
one 19 19.0
two or more 20 42.0
gt35-lt100 none 21 27.0
one 22 40.0
two or more 23 47.0
lt35 none 24 404.0
one 25 14.0
two or more 26 2.0
yes gt100 none 27 1.0
one 28 15.0
two or more 29 24.0
gt35-lt100 none 30 0.0
one 31 24.0
two or more 32 21.0
lt35 none 33 5.0
one 34 18.0
two or more 35 2.0
two or more no gt100 none 36 13.0
one 37 46.0
two or more 38 209.0
gt35-lt100 none 39 30.0
one 40 63.0
two or more 41 98.0
lt35 none 42 174.0
one 43 15.0
two or more 44 11.0
yes gt100 none 45 0.0
one 46 43.0
two or more 47 159.0
gt35-lt100 none 48 0.0
one 49 19.0
two or more 50 55.0
lt35 none 51 3.0
one 52 6.0
two or more 53 5.0

With marginals (aggregate, from ACS) and joint distribution (disaggregate, from PUMS) we're ready for some synthesis


In [16]:
"TBD"


Out[16]:
'TBD'

In [ ]: