Goals

  • Learn how to use the Census variables on Hispanic origin to calculate measures of diversity (keeping the Racial Dot Map as our framing example)

In [1]:
%pylab --no-import-all inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, Series, Index
import pandas as pd

from itertools import islice

In [3]:
import census
import us

import settings

The census documentation has example URLs but needs your API key to work. In this notebook, we'll use the IPython notebook HTML display mechanism to help out.


In [4]:
# API client shared by the generator functions in the following cells;
# the key comes from the local `settings` module rather than being written in the notebook
c = census.Census(key=settings.CENSUS_KEY)

In [5]:
# generators for the various census geographic entities of interest

def states(variables='NAME'):
    """Yield one SF1 record (a dict) for each state listed in ``us.states.STATES``.

    `variables` is passed unchanged to ``c.sf1.get``.
    """
    all_states_query = {'for': 'state:*'}
    known_fips = set(s.fips for s in us.states.STATES)
    # the wildcard query can return non-states; skip anything we do not recognize
    for record in c.sf1.get(variables, geo=all_states_query):
        if record['state'] not in known_fips:
            continue
        yield record
            
def counties(variables='NAME'):
    """Yield every county record, fetched with a single wildcard query.

    Counties whose 'state' code is not the FIPS code of an entry in
    ``us.states.STATES`` are skipped.
    """
    all_counties_query = {'for': 'county:*', 'in': 'state:*'}

    # FIPS codes of the states whose counties we keep
    wanted_states = set(state.fips for state in us.states.STATES)

    for record in c.sf1.get(variables, geo=all_counties_query):
        if record['state'] in wanted_states:
            yield record
        

def counties2(variables='NAME'):
    """Yield every county record, issuing one query per state.

    counties() gets the same data in a single call; this variant exists to
    demonstrate walking the states and querying the counties inside each.
    """
    for state in us.states.STATES:
        state_clause = 'state:{fips}'.format(fips=state.fips)
        per_state_query = {'for': 'county:*', 'in': state_clause}
        for record in c.sf1.get(variables, geo=per_state_query):
            yield record

            
def tracts(variables='NAME'):
    """Yield every census tract record, walking state -> county -> tract.

    Counties are looked up with 'NAME' only (just their codes are needed);
    `variables` is requested at the tract level.
    """
    for state in us.states.STATES:
        # uncomment to monitor progress:
        # print state.fips, state
        county_query = {'for': 'county:*',
                        'in': 'state:{fips}'.format(fips=state.fips)}

        for county in c.sf1.get('NAME', geo=county_query):
            # print county['state'], county['NAME']
            parent = 'state:{s_fips} county:{c_fips}'.format(s_fips=state.fips,
                                                             c_fips=county['county'])
            tract_query = {'for': 'tract:*', 'in': parent}

            for record in c.sf1.get(variables, geo=tract_query):
                yield record

In [6]:
def block_groups(variables='NAME'):
    """Yield every block-group record, walking county by county.

    `variables` is passed to ``c.sf1.get`` for the block-group queries only.
    Each yielded item is the record returned by the API wrapper for one block group.
    """
    # example query:
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=block+group:*&in=state:02+county:170

    # Only the 'state' and 'county' codes of each county are used below, so walk
    # the counties with the lightweight 'NAME' query (as tracts() does) instead
    # of also downloading `variables` for every county.
    for county in counties('NAME'):
        geo = {'for': 'block group:*',
               'in': 'state:{state} county:{county}'.format(state=county['state'],
                                                            county=county['county'])
               }
        for block_group in c.sf1.get(variables, geo=geo):
            yield block_group
    
    
def blocks(variables='NAME'):
    """Yield every census block record, walking tract by tract.

    `variables` is passed to ``c.sf1.get`` for the block queries only.
    Each yielded item is the record returned by the API wrapper for one block.
    """
    # example query:
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=block:*&in=state:02+county:290+tract:00100

    # Only the 'state', 'county' and 'tract' codes of each tract are used below,
    # so walk the tracts with the lightweight 'NAME' query rather than
    # downloading `variables` for every tract as well as every block.
    for tract in tracts('NAME'):
        geo = {'for': 'block:*',
               'in': 'state:{state} county:{county} tract:{tract}'.format(state=tract['state'],
                                                                          county=tract['county'],
                                                                          tract=tract['tract'])
               }
        for block in c.sf1.get(variables, geo=geo):
            yield block

In [7]:
# msa, csas, districts, zip_codes

def msas(variables="NAME"):
    """Yield metropolitan/micropolitan statistical area records, one state at a time.

    The query is repeated for every entry of ``us.STATES``, so the same area
    code can be yielded more than once (once per state it is returned for).
    """
    summary_level = 'metropolitan statistical area/micropolitan statistical area:*'
    for state in us.STATES:
        state_clause = 'state:{state_fips}'.format(state_fips=state.fips)
        query = {'for': summary_level, 'in': state_clause}
        for record in c.sf1.get(variables, geo=query):
            yield record

def csas(variables="NAME"):
    """Yield combined statistical area records, querying one state at a time."""
    # example query:
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=combined+statistical+area:*&in=state:24
    for state in us.STATES:
        state_clause = 'state:{state_fips}'.format(state_fips=state.fips)
        query = {'for': 'combined statistical area:*', 'in': state_clause}
        for record in c.sf1.get(variables, geo=query):
            yield record

def districts(variables="NAME"):
    """Yield congressional district records, querying one state at a time."""
    # example query:
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=congressional+district:*&in=state:24
    for state in us.STATES:
        state_clause = 'state:{state_fips}'.format(state_fips=state.fips)
        query = {'for': 'congressional district:*', 'in': state_clause}
        for record in c.sf1.get(variables, geo=query):
            yield record
            
def zip_code_tabulation_areas(variables="NAME"):
    """Yield ZIP code tabulation area (ZCTA) records, querying one state at a time."""
    # example query:
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=zip+code+tabulation+area:*&in=state:02
    for state in us.STATES:
        state_clause = 'state:{state_fips}'.format(state_fips=state.fips)
        query = {'for': 'zip code tabulation area:*', 'in': state_clause}
        for record in c.sf1.get(variables, geo=query):
            yield record

In [8]:
# peek at the first record only; islice stops the generator after one item
list(islice(msas(), 1))


Out[8]:
[{u'NAME': u'Albertville, AL Micro Area',
  u'metropolitan statistical area/micropolitan statistical area': u'10700',
  u'state': u'01'}]

In [9]:
# first combined statistical area record only
list(islice(csas(), 1))


Out[9]:
[{u'NAME': u'Atlanta-Sandy Springs-Gainesville, GA-AL CSA (part)',
  u'combined statistical area': u'122',
  u'state': u'01'}]

In [10]:
# keep the first congressional district record in a list and display it
districts_list = list(islice(districts(), 1))
districts_list


Out[10]:
[{u'NAME': u'Congressional District 1',
  u'congressional district': u'01',
  u'state': u'01'}]

In [11]:
# first ZIP code tabulation area record only
list(islice(zip_code_tabulation_areas(), 1))


Out[11]:
[{u'NAME': u'ZCTA5 30165 (part)',
  u'state': u'01',
  u'zip code tabulation area': u'30165'}]

Note: There are definitely improvements to be made in these generators. One of the most important would be to limit the generators to specific geographies -- typically, we don't want all the blocks in the country, only the ones in a specific area. A good exercise is to rewrite our generators to allow for a limited geography.

Hispanic or Latino Origin and Racial Subcategories

http://www.census.gov/developers/data/sf1.xml

compare to http://www.census.gov/prod/cen2010/briefs/c2010br-02.pdf

I think the P0050001 might be the key category

  • P0010001 = P0050001
  • P0050001 = P0050002 + P0050010

P0050002 Not Hispanic or Latino (total) =

  • P0050003 Not Hispanic White only
  • P0050004 Not Hispanic Black only
  • P0050006 Not Hispanic Asian only
  • Not Hispanic Other (should also equal P0050002 - (P0050003 + P0050004 + P0050006))

    • P0050005 Not Hispanic: American Indian and Alaska Native alone
    • P0050007 Not Hispanic: Native Hawaiian and Other Pacific Islander alone
    • P0050008 Not Hispanic: Some Other Race alone
    • P0050009 Not Hispanic: Two or More Races
  • P0050010 Hispanic or Latino

P0050010 = P0050011 + ... + P0050017

From Hispanic and Latino Americans (Wikipedia):

While the two terms are sometimes used interchangeably, Hispanic is a narrower term which mostly refers to persons of Spanish speaking origin or ancestry, while Latino is more frequently used to refer more generally to anyone of Latin American origin or ancestry, including Brazilians.

and

The Census Bureau's 2010 census does provide a definition of the terms Latino or Hispanic and is as follows: “Hispanic or Latino” refers to a person of Cuban, Mexican, Puerto Rican, South or Central American, or other Spanish culture or origin regardless of race. It allows respondents to self-define whether they were Latino or Hispanic and then identify their specific country or place of origin.[52] On its website, the Census Bureau defines "Hispanic" or "Latino" persons as being "persons who trace their origin [to]... Spanish speaking Central and South America countries, and other Spanish cultures".

In the Racial Dot Map: "Whites are coded as blue; African-Americans, green; Asians, red; Hispanics, orange; and all other racial categories are coded as brown."

In this notebook, we will relate the Racial Dot Map 5-category scheme to the P005* variables.


In [12]:
# Fetch, for every state, the total population and its Hispanic / non-Hispanic split.
# The total is tabulated in two variables: P0010001 and P0050001
# P0050002 Not Hispanic or Latino (total)
# P0050010 Hispanic or Latino

r = list(states(('NAME','P0010001','P0050001','P0050002','P0050010')))
# show the first five state records
r[:5]


Out[12]:
[{u'NAME': u'Alabama',
  u'P0010001': u'4779736',
  u'P0050001': u'4779736',
  u'P0050002': u'4594134',
  u'P0050010': u'185602',
  u'state': u'01'},
 {u'NAME': u'Alaska',
  u'P0010001': u'710231',
  u'P0050001': u'710231',
  u'P0050002': u'670982',
  u'P0050010': u'39249',
  u'state': u'02'},
 {u'NAME': u'Arizona',
  u'P0010001': u'6392017',
  u'P0050001': u'6392017',
  u'P0050002': u'4496868',
  u'P0050010': u'1895149',
  u'state': u'04'},
 {u'NAME': u'Arkansas',
  u'P0010001': u'2915918',
  u'P0050001': u'2915918',
  u'P0050002': u'2729868',
  u'P0050010': u'186050',
  u'state': u'05'},
 {u'NAME': u'California',
  u'P0010001': u'37253956',
  u'P0050001': u'37253956',
  u'P0050002': u'23240237',
  u'P0050010': u'14013719',
  u'state': u'06'}]

In [13]:
# Hispanic/Latino origin vs not-Hispanic/Latino, summed over all states
# Compare with http://www.census.gov/prod/cen2010/briefs/c2010br-02.pdf Table 1
# Hispanic/Latino: 50477594
# non-Hispanic/Latino: 258267944

count_columns = ['P0010001', 'P0050001', 'P0050002', 'P0050010']

df = DataFrame(r)
# the API returns every count as a string, so cast before summing
df[count_columns] = df[count_columns].astype('int')
df[count_columns].sum()


Out[13]:
P0010001    308745538
P0050001    308745538
P0050002    258267944
P0050010     50477594
dtype: int64

In [14]:
# Are the total Hispanic/Latino and non-Hispanic/Latino populations the same as reported in
# http://www.census.gov/prod/cen2010/briefs/c2010br-02.pdf Table 1?
(df['P0050010'].sum() == 50477594,
 df['P0050002'].sum() == 258267944)


Out[14]:
(True, True)

In [15]:
# How about the non-Hispanic/Latino White only category?
# P0050003
# total should be 196817552

# note: `df` is rebound here to a new frame holding only NAME, P0050003 and the state code
df = DataFrame(list(states('NAME,P0050003')))
df['P0050003'] = df['P0050003'].astype('int')
df.P0050003.sum()


Out[15]:
196817552

Converting to Racial Dot Map Categories

SUGGESTED EXERCISE: write a function convert_to_rdotmap(row) that takes an input Python dict that has the keys:

* NAME
* P0050001, P0050002, ..., P0050016, P0050017

and that returns a Pandas Series with the following columns:

* Total
* White
* Black
* Asian
* Hispanic
* Other
* Name  (note lowercase)

that correspond to those used in the Racial Dot Map.

Also write a function convert_P005_to_int(df) that converts all the P005* columns to int


In [16]:
# Use a little convenience function to calculate the variable names to be used

def P005_range(n0,n1):
    """Return the SF1 variable names 'P005' + zero-padded index as a tuple.

    The indices run over n0 <= i < n1 (n1 is exclusive), each padded to four
    digits, e.g. P005_range(1, 3) -> ('P0050001', 'P0050002').
    """
    # range() instead of xrange(): behaves the same here and exists on both Python 2 and 3
    return tuple('P005{i:04d}'.format(i=i) for i in range(n0, n1))

# P0050001 .. P0050017 (the upper bound of the range is exclusive)
P005_vars = P005_range(1,18)
# the same names joined into one comma-separated string
P005_vars_str = ",".join(P005_vars)
# the same names as a list, with the geography's NAME in front
P005_vars_with_name = ['NAME'] + list(P005_vars)

P005_vars_with_name


Out[16]:
['NAME',
 'P0050001',
 'P0050002',
 'P0050003',
 'P0050004',
 'P0050005',
 'P0050006',
 'P0050007',
 'P0050008',
 'P0050009',
 'P0050010',
 'P0050011',
 'P0050012',
 'P0050013',
 'P0050014',
 'P0050015',
 'P0050016',
 'P0050017']

In [17]:
# HAVE YOU TRIED THE EXERCISE....IF NOT....TRY IT....HERE'S ONE POSSIBLE ANSWER# 

# http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/#create

def convert_P005_to_int(df):
    """Cast every column named in ``P005_vars`` to int and return the frame.

    The columns are reassigned on `df` itself, so the caller's DataFrame is
    modified; the same object is returned for convenience.
    """
    p005_columns = list(P005_vars)
    df[p005_columns] = df[p005_columns].astype('int')
    return df

def convert_to_rdotmap(row):
    """Map one record of P005 variables onto the Racial Dot Map categories.

    `row` must provide 'NAME' and P0050001, P0050003 .. P0050010.  The result
    is a pandas Series indexed, in this order, by
    Name, Total, White, Black, Hispanic, Asian, Other, where Other is
    P0050005 + P0050007 + P0050008 + P0050009.
    """
    column_order = ['Name', 'Total', 'White', 'Black', 'Hispanic', 'Asian', 'Other']

    total = row['P0050001']
    white = row['P0050003']
    black = row['P0050004']
    asian = row['P0050006']
    hispanic = row['P0050010']
    # every non-Hispanic group that has no category of its own
    other = row['P0050005'] + row['P0050007'] + row['P0050008'] + row['P0050009']
    name = row['NAME']

    categories = {'Name': name, 'Total': total, 'White': white, 'Black': black,
                  'Hispanic': hispanic, 'Asian': asian, 'Other': other}
    return pd.Series(categories, index=column_order)

In [18]:
from census import Census

import settings
from settings import CENSUS_KEY

import time
from itertools import islice

def P005_range(n0,n1):
    """Return the SF1 variable names 'P005' + zero-padded index as a tuple.

    The indices run over n0 <= i < n1 (n1 is exclusive), each padded to four
    digits, e.g. P005_range(1, 3) -> ('P0050001', 'P0050002').
    """
    # range() instead of xrange(): behaves the same here and exists on both Python 2 and 3
    return tuple('P005{i:04d}'.format(i=i) for i in range(n0, n1))

# P0050001 .. P0050017 (the upper bound of the range is exclusive)
P005_vars = P005_range(1,18)
# the same names joined into one comma-separated string
# NOTE(review): P005_vars_with_name is not rebuilt in this cell; later cells that use it rely on the earlier cell that defines it
P005_vars_str = ",".join(P005_vars)


# http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/#create
def convert_to_rdotmap(row):
    """Map one record of P005 variables onto the Racial Dot Map categories.

    `row` must provide 'NAME' and P0050001, P0050003 .. P0050010.  The result
    is a pandas Series indexed, in this order, by
    Name, Total, White, Black, Hispanic, Asian, Other, where Other is
    P0050005 + P0050007 + P0050008 + P0050009.
    """
    column_order = ['Name', 'Total', 'White', 'Black', 'Hispanic', 'Asian', 'Other']

    total = row['P0050001']
    white = row['P0050003']
    black = row['P0050004']
    asian = row['P0050006']
    hispanic = row['P0050010']
    # every non-Hispanic group that has no category of its own
    other = row['P0050005'] + row['P0050007'] + row['P0050008'] + row['P0050009']
    name = row['NAME']

    categories = {'Name': name, 'Total': total, 'White': white, 'Black': black,
                  'Hispanic': hispanic, 'Asian': asian, 'Other': other}
    return pd.Series(categories, index=column_order)


def normalize(s):
    """Return `s` as floats divided by its sum, so the result adds up to 1.0."""
    denominator = np.sum(s)
    as_float = s.astype('float')
    return as_float / denominator


def entropy(series):
    """Normalized Shannon index of a collection of counts.

    The Shannon entropy of the proportions ``p_i = x_i / sum(x)`` is divided
    by ``log(len(series))``, the largest entropy possible for that number of
    categories.  Equal counts in every category therefore give 1.0 and a
    single non-empty category gives 0.0.

    Zero counts are left out of the entropy sum (0 * log 0 is taken as 0) but
    they still count as categories in the normalization, so the result is
    comparable across rows that share the same set of categories.

    Parameters
    ----------
    series : pandas Series, NumPy array or plain sequence of numbers
        The counts, one entry per category.

    Returns
    -------
    float
        The normalized entropy; 0.0 when there are fewer than two categories
        or when every count is zero.
    """
    # accept any sequence, and work on a float array so the sums are vectorized
    counts = np.asarray(series, dtype='float')

    n_categories = len(counts)
    if n_categories < 2:
        # with 0 or 1 categories there is nothing to be diverse about
        return 0.0

    # eliminate 0s: they contribute nothing and log(0) is undefined
    nonzero = counts[counts != 0]
    if nonzero.size == 0:
        return 0.0

    # maximum possible entropy for the given number of categories
    max_entropy = np.log(n_categories)

    p = nonzero / nonzero.sum()
    # "0.0 - ..." rather than unary minus so a single non-empty category yields 0.0, not -0.0
    shannon = 0.0 - np.sum(p * np.log(p))
    return float(shannon / max_entropy)

    
def convert_P005_to_int(df):
    """Cast every column named in ``P005_vars`` to int and return the frame.

    The columns are reassigned on `df` itself, so the caller's DataFrame is
    modified; the same object is returned for convenience.
    """
    p005_columns = list(P005_vars)
    df[p005_columns] = df[p005_columns].astype('int')
    return df
    

def diversity(r):
    """Build a Racial Dot Map style table with diversity scores.

    `r` is a collection of records (dicts) carrying NAME and the P005
    variables.  Returns a DataFrame with the columns produced by
    convert_to_rdotmap (Name, Total, White, Black, Hispanic, Asian, Other)
    plus:

    * entropy5 -- entropy() over Asian, Black, Hispanic, White and Other
    * entropy4 -- entropy() over Asian, Black, Hispanic and White
    """
    four_groups = ['Asian', 'Black', 'Hispanic', 'White']
    five_groups = four_groups + ['Other']

    counts = convert_P005_to_int(DataFrame(r))
    rdotmap = counts.apply(convert_to_rdotmap, axis=1)

    rdotmap['entropy5'] = rdotmap[five_groups].apply(entropy, axis=1)
    rdotmap['entropy4'] = rdotmap[four_groups].apply(entropy, axis=1)
    return rdotmap

In [19]:
# states: fetch NAME plus all P005 variables for every state, then score diversity

r=list(states(P005_vars_with_name))
diversity(r)


Out[19]:
Name Total White Black Hispanic Asian Other entropy5 entropy4
0 Alabama 4779736 3204402 1244437 185602 52937 92358 0.541001 0.570292
1 Alaska 710231 455320 21949 39249 37459 156254 0.646677 0.475235
2 Arizona 6392017 3695647 239101 1895149 170509 391611 0.663524 0.643529
3 Arkansas 2915918 2173469 447102 186050 35647 73650 0.515025 0.526205
4 California 37253956 14956253 2163804 14013719 4775070 1345110 0.796994 0.843670
5 Colorado 5029196 3520793 188778 1038687 135564 145374 0.558232 0.570130
6 Connecticut 3574097 2546262 335119 479087 134091 79538 0.584509 0.615330
7 Delaware 897934 586752 186782 73221 28308 22871 0.628490 0.660917
8 District of Columbia 601723 209464 301053 54749 20818 15639 0.710288 0.757369
9 Florida 18801310 10884722 2851100 4223806 445216 396466 0.688393 0.741076
10 Georgia 9687653 5413920 2910800 853689 311692 197552 0.677545 0.729666
11 Hawaii 1360301 309343 19904 120842 513294 396918 0.833108 0.750762
12 Idaho 1567582 1316243 8875 175901 18529 48034 0.360829 0.330227
13 Illinois 12830632 8167753 1832924 2027578 580586 221791 0.663131 0.719347
14 Indiana 6483802 5286453 582140 389707 101444 124058 0.430342 0.439752
15 Iowa 3046355 2701123 86906 151544 52597 54185 0.310137 0.300998
16 Kansas 2853118 2230539 162700 300042 66967 92870 0.492215 0.483675
17 Kentucky 4339367 3745655 333075 132836 48338 79463 0.344293 0.340010
18 Louisiana 4533372 2734884 1442420 192560 69327 94181 0.588919 0.623788
19 Maine 1328361 1254297 15154 16935 13442 28533 0.180061 0.137155
20 Maryland 5773552 3157958 1674229 470632 316694 154039 0.714090 0.760596
21 Massachusetts 6547629 4984800 391693 627654 347495 195987 0.535423 0.540767
22 Michigan 9883640 7569939 1383756 436358 236490 257097 0.498010 0.504299
23 Minnesota 5303925 4405142 269141 250258 212996 166388 0.427024 0.407947
24 Mississippi 2967297 1722287 1093512 81481 25477 44540 0.550642 0.591949
25 Missouri 5988927 4850748 687149 212470 97221 141339 0.430525 0.429356
26 Montana 989415 868628 3743 28565 6138 82341 0.295872 0.149198
27 Nebraska 1826341 1499753 80959 167405 31919 46305 0.424281 0.417907
28 Nevada 2700551 1462081 208058 716501 191047 122864 0.751622 0.774363
29 New Hampshire 1316470 1215050 13625 36704 28241 22850 0.232308 0.210183
30 New Jersey 8791894 5214878 1125401 1555144 719827 176644 0.722462 0.783517
31 New Mexico 2059179 833810 35462 953403 26305 210199 0.671781 0.603770
32 New York 19378102 11304247 2783857 3416922 1406194 466882 0.732727 0.787727
33 North Carolina 9535483 6223995 2019854 800120 206579 284935 0.623233 0.645955
34 North Dakota 672591 598007 7720 13467 6839 46558 0.289289 0.165826
35 Ohio 11536504 9359263 1389115 354674 190765 242687 0.422934 0.426370
36 Oklahoma 3751351 2575381 272071 332007 64154 507738 0.623426 0.506346
37 Oregon 3831074 3005848 64984 450062 139436 170744 0.478609 0.444008
38 Pennsylvania 12702379 10094652 1327091 719660 346288 214688 0.465015 0.486249
39 Rhode Island 1052567 803685 51560 130655 29988 36679 0.516377 0.508129
40 South Carolina 4625364 2962740 1279998 235682 58307 88637 0.573768 0.609445
41 South Dakota 814180 689502 9959 22119 7553 85047 0.355383 0.191061
42 Tennessee 6346105 4800782 1049391 290059 90311 115562 0.486619 0.508575
43 Texas 25145561 11397345 2886825 9460921 948426 452044 0.727466 0.793870
44 Utah 2763885 2221719 25951 358340 54176 103699 0.425283 0.393087
45 Vermont 625741 590223 5943 9208 7875 12492 0.183061 0.144800
46 Virginia 8001024 5186450 1523704 631825 436298 222747 0.655915 0.688954
47 Washington 6724540 4876804 229603 755790 475634 386709 0.587508 0.555274
48 West Virginia 1852994 1726256 62122 22268 12285 30063 0.206960 0.183409
49 Wisconsin 5686986 4738411 350898 336056 128052 133569 0.412929 0.408698
50 Wyoming 563626 483874 4351 50231 4279 20891 0.337501 0.288172

51 rows × 9 columns


In [20]:
# counties: same variables as for the states, one record per county
# note: this rebinds `r`, replacing the state records fetched above

r = list(counties(P005_vars_with_name))

In [21]:
# Racial Dot Map categories and entropy scores for every county
df2 = diversity(r)

In [2]:
# rank the counties from highest to lowest entropy5
# NOTE(review): the NameError recorded below comes from running this cell as In [2] of a
# fresh kernel, before df2 was defined; run the notebook top to bottom to reproduce it.
# NOTE(review): sort_index(by=...) was superseded by sort_values(...) in later pandas
# releases -- confirm against the installed version before re-running.
df2.sort_index(by='entropy5',ascending=False)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-04254caa1209> in <module>()
----> 1 df2.sort_index(by='entropy5',ascending=False)

NameError: name 'df2' is not defined

In [25]:
# collect every MSA record (name and total population) into a list
msas_list = list(msas('NAME,P0010001'))

In [26]:
# number of records returned, counting one per (state, area) query result
len(msas_list)


Out[26]:
1013

In [27]:
# note: `df` is rebound here to the MSA records
df = DataFrame(msas_list)

In [29]:
# cast the population column to int so it can be summed in the next cell
df.P0010001 = df.P0010001.astype('int')

In [34]:
# Total population per MSA code.  The same code can appear in several records
# (the list above has more records than there are distinct codes), so the
# per-record populations are added together.  The grouped .sum() is the
# built-in, vectorized equivalent of apply(lambda x: sum(...)).
df.groupby('metropolitan statistical area/micropolitan statistical area')['P0010001'].sum()


Out[34]:
metropolitan statistical area/micropolitan statistical area
10020                                                           57999
10100                                                           40602
10140                                                           72797
10180                                                          165252
10220                                                           37492
10300                                                           99892
10420                                                          703200
10460                                                           63797
10500                                                          157308
10540                                                          116672
10580                                                          870716
10620                                                           60585
10660                                                           31255
10700                                                           93019
10740                                                          887077
...
49060                                                           36311
49100                                                           51461
49180                                                          477717
49260                                                           20081
49300                                                          114520
49340                                                          798552
49380                                                           21378
49420                                                          243231
49460                                                           22438
49540                                                           28065
49620                                                          434972
49660                                                          565773
49700                                                          166892
49740                                                          195751
49780                                                           86074
Length: 942, dtype: int64

In [1]:
# NOTE(review): leftover scratch cell -- it was executed as In [1] of a fresh kernel,
# where `r` had not been defined yet, hence the NameError recorded below
type(r)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-a1faf272f0d7> in <module>()
----> 1 type(r)

NameError: name 'r' is not defined