Idea #1 - comparison of segregation in Boston vs Philly (now and prior)

Ways of measuring:

*Compare percent of different geographies (from blocks to ZIP code) that are predominately one race or ethnicity

*The same but white vs minority in general

*The same as either but also considering population of a tract, and distance to close tract with a predominance that is not your own. Classify the tracts


In [3]:
import charts_function_list
import os
base,data,outputs = charts_function_list.folder_setup()
from census import Census
from us import states
key = key
c = Census(key)

In [ ]:
class state_things(object):
    def __init__(self,state_name):
        
        temp_df = pd.DataFrame(c.acs5.state('NAME', Census.ALL))
        self.state_name = state_name
        self.state_code = temp_df[temp_df['NAME']==self.state_name]['state'].values[0]
        self.county_codes_full = pd.DataFrame(c.acs5.get('NAME', geo={'for': 'county:*',
                       'in': 'state:'+self.state_code}))
        
        self.county_codes = {k:v for k,v in zip(self.county_codes_full['county'],self.county_codes_full['NAME'])}
        
        
    def search_for_county(self,county_name):
        return self.county_codes_full[self.county_codes_full['NAME'].str.contains(county_name)]
    
    
        
    def get_by_tract(self,county_code,additional_fields=None):
        if additional_fields == None:
            tract_df = pd.DataFrame(c.acs5.state_county_tract('NAME', self.state_code,county_code, Census.ALL))
            
        else:
            tract_df = pd.DataFrame(c.acs5.state_county_tract(tuple(['NAME'])+tuple(additional_fields), self.state_code,county_code, Census.ALL))
      
        tract_df['county_map'] = tract_df['county'].map(self.county_codes)
        return tract_df
        
    def get_places(self,additional_fields=None):
        if additional_fields == None:
            place_df = pd.DataFrame(c.acs5.state_place('NAME',self.state_code, place=Census.ALL))

        else:
            place_df = pd.DataFrame(c.acs5.state_place((tuple(['NAME'])+tuple(additional_fields)),
                                                    self.state_code, place= Census.ALL))
        return place_df

#find fields in a particular table and return as a dictionary
def fields_in_table(table_number):
    table_list =  list(filter(lambda x: table_number in x, c.acs5.fields().keys()))
    labels = [c.acs5.fields()[x]['label'] for x in table_list]
    return {k:v for k,v in zip(table_list,labels)}

In [705]:
state_codes = c.acs5.state('NAME', Census.ALL)
state_code_frame = pd.DataFrame(state_codes)
d = {name:state_things(name) for name in state_code_frame['NAME']}

In [709]:
us_places = []
us_tracts = []

#create dictionary to get fields in the Race by Hispanic Origin table
hisp_race_dict = fields_in_table(table_number='B03002')

#take just the keys to generate the tables
fields_we_want = list(hisp_race_dict.keys())

for states in list(d.keys()):
    #pull all the places for each state
    us_places.append(d[states].get_places(additional_fields=fields_we_want))
    
    for counties in list(d[states].county_codes.keys()):
        us_tracts.append(d[states].get_by_tract(county_code=counties,additional_fields=fields_we_want))

place_race_frame = pd.concat(us_places).reset_index(drop=True)
tract_race_frame = pd.concat(us_tracts).reset_index(drop=True)

In [963]:
def reformat_race(df,geo):
    race_mapping = {'B03002_001E':'TOTAL',
                       'B03002_003E':'WHITE', #White = White NH
                       'B03002_004E':'BLACK', #Black = Black NH
                       'B03002_005E':'AK_NH',
                       'B03002_006E':'ASIAN_NH',
                       'B03002_007E':'PI_NH',
                       'B03002_008E':'OTHER_NH',
                       'B03002_009E':'MIX_NH',
                       'B03002_012E':'HISP_ALL',
                       'B03002_013E':'WHITE_H',
                       'B03002_014E':'BLACK_H',
                       'B03002_015E':'AK_H',
                       'B03002_016E':'ASIAN_H',
                       'B03002_017E':'PI_H',
                       'B03002_018E':'OTHER_H',
                       'B03002_019E':'MIX_H'}
    df = df.rename(columns=race_mapping)
    if geo == 'place':
        df = df[list(race_mapping.values())+['NAME', 'place', 'state']]
    elif geo == 'tract':
         df = df[list(race_mapping.values())+['NAME', 'tract', 'state','county','county_map']]
    
    df['AK'] = df['AK_NH']+df['AK_H']
    df['API']= df['ASIAN_H']+df['ASIAN_NH']+df['PI_H']+df['PI_NH']
    df['OTHER_MIX']=df['OTHER_NH']+df['MIX_NH']
    df['HISP'] = df['HISP_ALL']-df['AK_H']-df['PI_H']-df['ASIAN_H']
    def percent_group(column):
        return df[column]/df['TOTAL']
    
    percent_frame = pd.DataFrame({'PER_WHITE':percent_group('WHITE'),
                                 'PER_BLACK':percent_group('BLACK'),
                                 'PER_API':percent_group('API'),
                                 'PER_AKNA':percent_group('AK'),
                                  'PER_HISP':percent_group('HISP'),
                                  'PER_OTHER':percent_group('OTHER_MIX')
                                 
                                 })
    
    df = pd.merge(df,percent_frame,left_index=True,right_index=True)
    df['MAX_ETH_PER']= percent_frame.max(axis=1)
    df['MAX_ETH']= percent_frame.idxmax(axis=1)
    
    def threshold_percent(threshold):
         return np.where(df['MAX_ETH_PER']>=threshold,1,0)
    
    df['Above 70'],df['Above 80'],df['Above 90']=threshold_percent(.7),threshold_percent(.8),threshold_percent(.9)
    
    return df

In [964]:
place_race_frame_full = reformat_race(place_race_frame,geo='place').sort_values(by='MAX_ETH_PER',ascending=False)
tract_race_frame_full = reformat_race(tract_race_frame,geo='tract').sort_values(by='MAX_ETH_PER',ascending=False)
tract_race_frame_full['GEO_ID']=tract_race_frame_full[['state','county','tract']].astype('str').apply(lambda x: x.sum(),axis=1)

In [1008]:
os.chdir(data)
place_race_frame_full.to_csv('2016_ACS_race_place_full.csv')
tract_race_frame_full.to_csv('2016_ACS_race_tract_full.csv')

In [1012]:
tract_race_frame_full.describe()


Out[1012]:
TOTAL WHITE BLACK AK_NH ASIAN_NH PI_NH OTHER_NH MIX_NH HISP_ALL WHITE_H BLACK_H AK_H ASIAN_H PI_H OTHER_H MIX_H AK API OTHER_MIX HISP PER_AKNA PER_API PER_BLACK PER_HISP PER_OTHER PER_WHITE MAX_ETH_PER Above 70 Above 80 Above 90
count 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 74001.000000 73306.000000 73306.000000 73306.000000 73306.000000 73306.000000 73306.000000 73306.000000 74001.000000 74001.000000 74001.000000
mean 4352.475602 2667.386549 528.386441 28.167403 221.992784 6.878421 9.158255 97.391157 793.114593 523.180646 19.740004 7.097404 2.655518 0.691788 201.767746 37.981487 35.264807 232.218511 106.549411 782.669883 0.008880 0.048667 0.132879 0.168072 0.024345 0.617157 0.746134 0.614816 0.456359 0.256497
std 2175.316924 1889.675151 902.984218 178.352648 486.204507 50.717061 33.412553 123.721987 1264.186601 930.526358 75.115015 25.951491 12.788123 8.874696 434.157866 90.022092 183.869191 503.464694 131.112594 1253.893232 0.046403 0.091730 0.217001 0.228248 0.026013 0.306745 0.177724 0.486642 0.498095 0.436702
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.231494 0.000000 0.000000 0.000000
25% 2893.000000 1260.000000 27.000000 0.000000 6.000000 0.000000 0.000000 23.000000 86.000000 49.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.000000 26.000000 84.000000 0.000000 0.002585 0.007570 0.024418 0.007686 0.386574 0.606651 0.000000 0.000000 0.000000
50% 4080.000000 2482.000000 149.000000 0.000000 54.000000 0.000000 0.000000 63.000000 283.000000 175.000000 0.000000 0.000000 0.000000 0.000000 37.000000 7.000000 2.000000 59.000000 70.000000 276.000000 0.000778 0.015626 0.037615 0.070142 0.018087 0.709287 0.777951 1.000000 0.000000 0.000000
75% 5465.000000 3777.000000 606.000000 15.000000 216.000000 0.000000 1.000000 131.000000 883.000000 552.000000 11.000000 0.000000 0.000000 0.000000 188.000000 42.000000 24.000000 228.000000 143.000000 869.000000 0.006017 0.051446 0.144482 0.204618 0.033377 0.881449 0.904068 1.000000 1.000000 1.000000
max 61133.000000 37457.000000 18513.000000 9607.000000 12452.000000 3520.000000 1149.000000 3516.000000 27381.000000 23937.000000 2998.000000 674.000000 819.000000 921.000000 8676.000000 3281.000000 9655.000000 12452.000000 3516.000000 27360.000000 1.000000 0.911438 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

Selecting what cities to feature


In [5]:
os.chdir(data)

In [6]:
place_race_frame_full = pd.read_csv('2016_ACS_race_place_full.csv')
ak_max = place_race_frame_full[(place_race_frame_full['TOTAL']>500000)]['PER_AKNA'].max()
per_other_max = place_race_frame_full[(place_race_frame_full['TOTAL']>500000)]['PER_OTHER'].max()


def max_print(group, minimum):
    location = place_race_frame_full[(place_race_frame_full['MAX_ETH']==group)&(place_race_frame_full['TOTAL']>minimum)].reset_index().loc[0]['NAME']
    value = place_race_frame_full[(place_race_frame_full['MAX_ETH']==group)&(place_race_frame_full['TOTAL']>minimum)].reset_index().loc[0]['MAX_ETH_PER']
    print('MOST '+group+": "+location+" "+format(value*100,'.2f')+'%')
for item in ['PER_WHITE','PER_BLACK','PER_HISP','PER_API']:
    max_print(item,500000)
    
print("---")    
print("No place over 500,000K with majorirty AK/Native American or Other/Mixed Race. Finding the max perctages instead")
print("---")
print('PER_AK: ',place_race_frame_full[place_race_frame_full['PER_AKNA']==ak_max].reset_index().loc[0]['NAME'])
print('PER_OTHER: ',place_race_frame_full[place_race_frame_full['PER_OTHER']==per_other_max].reset_index().loc[0]['NAME'])


MOST PER_WHITE: Portland city, Oregon 71.55%
MOST PER_BLACK: Detroit city, Michigan 79.40%
MOST PER_HISP: El Paso city, Texas 79.76%
MOST PER_API: San Jose city, California 34.54%
---
No place over 500,000K with majorirty AK/Native American or Other/Mixed Race. Finding the max perctages instead
---
PER_AK:  Albuquerque city, New Mexico
PER_OTHER:  Seattle city, Washington

In [11]:
place_race_frame_full[place_race_frame_full['NAME']=='El Paso city, Texas']


Out[11]:
Unnamed: 0 TOTAL WHITE BLACK AK_NH ASIAN_NH PI_NH OTHER_NH MIX_NH HISP_ALL WHITE_H BLACK_H AK_H ASIAN_H PI_H OTHER_H MIX_H NAME place state AK API OTHER_MIX HISP PER_AKNA PER_API PER_BLACK PER_HISP PER_OTHER PER_WHITE MAX_ETH_PER MAX_ETH Above 70 Above 80 Above 90
20125 24973 678058.0 94654.0 22609.0 1594.0 7984.0 991.0 629.0 5576.0 544021.0 472521.0 3313.0 2412.0 590.0 182.0 55500.0 9503.0 El Paso city, Texas 24000 48 4006.0 9747.0 6205.0 540837.0 0.005908 0.014375 0.033344 0.797626 0.009151 0.139596 0.797626 PER_HISP 1 0 0

In [17]:
reload_tract = pd.read_csv('2016_ACS_race_tract_full.csv',index_col=0)

In [20]:
reload_tract[reload_tract['GEO_ID']==41051007400]


Out[20]:
TOTAL WHITE BLACK AK_NH ASIAN_NH PI_NH OTHER_NH MIX_NH HISP_ALL WHITE_H BLACK_H AK_H ASIAN_H PI_H OTHER_H MIX_H NAME tract state county county_map AK API OTHER_MIX HISP PER_AKNA PER_API PER_BLACK PER_HISP PER_OTHER PER_WHITE MAX_ETH_PER MAX_ETH Above 70 Above 80 Above 90 GEO_ID
55105 3995.0 1591.0 846.0 45.0 122.0 19.0 0.0 283.0 1089.0 482.0 0.0 1.0 0.0 0.0 77.0 529.0 Census Tract 74, Multnomah County, Oregon 7400 41 51 Multnomah County, Oregon 46.0 141.0 283.0 1088.0 0.011514 0.035294 0.211765 0.27234 0.070839 0.398248 0.398248 PER_WHITE 0 0 0 41051007400

In [510]:
#percent of population that lives in a tract that is >=x% of one ethnic group
def homogenous_pop(df,threshold_column):
    percent = (df[df[threshold_column]==1]['TOTAL'].sum())/df['TOTAL'].sum()
    return format(percent*100,'.1f')

Some extra stuff


In [ ]:
#For those who are non-white and Hispanic, which to include within the "hispanic" group? Let's assess

for counties in [reformat_race(la_county_data), reformat_race(philly_county_data),reformat_race(suffolk_county_data)]:
    for item in ['White_H','Black_H','AK_H','Asian_H','PI_H','OTHER_H','MIX_H']:
        print(item+' '+str(format(((counties[item]/counties['HISP_All']).mean())*100,'.1f'))+'%')
    print('------')

In [ ]:
## extra tutorial stuff

c.acs5.get(('NAME', table_name),
          {'for': 'state:{}'.format(states.MD.fips)},year=2011) #set different years



table_name ='B25034_010E'

#The get method is the core data access method on both the ACS and SF1 data sets. 
#The first parameter is either a single string column or a tuple of columns. 
#The second parameter is a geoemtry dict with a for key and on option in key. 
c.acs5.get(('NAME', table_name),
          {'for': 'state:{}'.format(states.MD.fips)},year=2011) #set different years

In [ ]:
Some info on Census and US libraries from 

https://github.com/datamade/census

## Description

#### ACS5 
state(fields, state_fips)


state_county(fields, state_fips, county_fips)


state_county_blockgroup(fields, state_fips, county_fips, blockgroup)


state_county_subdivision(fields, state_fips, county_fips, subdiv_fips)


state_county_tract(fields, state_fips, county_fips, tract)


state_place(fields, state_fips, place)


state_district(fields, state_fips, district)


us(fields)

zipcode(fields, zip5)

#### ACS1

ACS1 Geographies

state(fields, state_fips)

state_district(fields, state_fips, district)
us(fields)


## Datasets
*acs5: ACS 5 Year Estimates (2016, 2015, 2014, 2013, 2012, 2011, 2010)

*acs1dp: ACS 1 Year Estimates, Data Profiles (2016, 2015, 2014, 2013, 2012)

*sf1: Census Summary File 1 (2010, 2000, 1990)

*sf3: Census Summary File 3 (2000, 1990)