Ways of measuring:
* Compare the percent of different geographies (from blocks to ZIP codes) that are predominantly one race or ethnicity
* The same, but white vs. minority in general
* The same as either, but also considering the population of a tract and the distance to the nearest tract whose predominant group is not its own. Classify the tracts
In [3]:
# Standard library
import os

# Third-party. pandas and numpy are used throughout this notebook
# (pd.DataFrame, pd.concat, np.where) and must be imported before any
# other cell runs.
import numpy as np
import pandas as pd
from census import Census
from us import states

# Project-local helpers
import charts_function_list

# folder_setup() returns three values; `data` is the folder the CSV cells
# later switch into with os.chdir().
base, data, outputs = charts_function_list.folder_setup()

# Census API key. The bare self-assignment `key = key` only works when `key`
# already exists in the interactive session, so reuse such a value when it is
# present and otherwise read the key from the CENSUS_API_KEY environment
# variable.
key = globals().get('key') or os.environ.get('CENSUS_API_KEY')
if not key:
    raise RuntimeError('No Census API key found: define `key` or set the '
                       'CENSUS_API_KEY environment variable')

# Shared Census client used by every query below
c = Census(key)
In [ ]:
class state_things(object):
    """ACS 5-year lookups for one state, built on the module-level client ``c``.

    Attributes:
        state_name: the state name passed to the constructor.
        state_code: the ``state`` value of the row whose ``NAME`` equals
            ``state_name`` in the ACS state list.
        county_codes_full: DataFrame with one row per county of the state.
        county_codes: dict mapping each county code to the county ``NAME``.
    """

    # The state list is the same for every instance, so it is fetched on
    # first use and shared instead of being requested once per state.
    _state_table = None

    def __init__(self, state_name):
        """Resolve the state code and load the county list for *state_name*.

        Raises:
            IndexError: if no row of the state list has this ``NAME``.
        """
        if state_things._state_table is None:
            state_things._state_table = pd.DataFrame(
                c.acs5.state('NAME', Census.ALL))
        state_table = state_things._state_table

        self.state_name = state_name
        matches = state_table.loc[state_table['NAME'] == state_name, 'state']
        if matches.empty:
            # Same exception type as indexing an empty array, but with a
            # message that names the offending input.
            raise IndexError(
                'no state named {!r} in the ACS state list'.format(state_name))
        self.state_code = matches.values[0]

        self.county_codes_full = pd.DataFrame(
            c.acs5.get('NAME', geo={'for': 'county:*',
                                    'in': 'state:' + self.state_code}))
        self.county_codes = dict(zip(self.county_codes_full['county'],
                                     self.county_codes_full['NAME']))

    @staticmethod
    def _query_fields(additional_fields):
        """Return the field spec for a query: 'NAME' alone or NAME + extras."""
        # `is None` (not `== None`) so array-like arguments are handled too.
        if additional_fields is None:
            return 'NAME'
        return ('NAME',) + tuple(additional_fields)

    def search_for_county(self, county_name):
        """Return the county rows whose NAME contains *county_name*.

        The pattern is handed to ``Series.str.contains``, which treats it as a
        regular expression by default.
        """
        name_matches = self.county_codes_full['NAME'].str.contains(county_name)
        return self.county_codes_full[name_matches]

    def get_by_tract(self, county_code, additional_fields=None):
        """Return a DataFrame of all tracts in *county_code*.

        Args:
            county_code: county code within this state.
            additional_fields: optional iterable of field names requested in
                addition to ``NAME``.

        The result gets an extra ``county_map`` column holding the county
        name looked up from ``self.county_codes``.
        """
        tract_df = pd.DataFrame(c.acs5.state_county_tract(
            self._query_fields(additional_fields),
            self.state_code, county_code, Census.ALL))
        tract_df['county_map'] = tract_df['county'].map(self.county_codes)
        return tract_df

    def get_places(self, additional_fields=None):
        """Return a DataFrame of all places in the state.

        Args:
            additional_fields: optional iterable of field names requested in
                addition to ``NAME``.
        """
        return pd.DataFrame(c.acs5.state_place(
            self._query_fields(additional_fields),
            self.state_code, place=Census.ALL))
#find fields in a particular table and return as a dictionary
def fields_in_table(table_number):
    """Return ``{field_name: label}`` for the fields of one ACS5 table.

    Args:
        table_number: text that must occur in the field name, e.g. 'B03002'.

    Returns:
        dict mapping every field name of ``c.acs5.fields()`` that contains
        *table_number* to that field's ``'label'`` entry.
    """
    # Fetch the field catalogue once instead of once per matching field.
    all_fields = c.acs5.fields()
    return {name: meta['label']
            for name, meta in all_fields.items()
            if table_number in name}
In [705]:
# Name and code of every state, as returned by the ACS 5-year endpoint
state_codes = c.acs5.state('NAME', Census.ALL)
state_code_frame = pd.DataFrame(state_codes)

# One state_things helper per state, keyed by the state's name
d = dict(
    (name, state_things(name))
    for name in state_code_frame['NAME']
)
In [709]:
# Per-state place frames and per-county tract frames, concatenated below
us_places = []
us_tracts = []

#create dictionary to get fields in the Race by Hispanic Origin table
hisp_race_dict = fields_in_table(table_number='B03002')

#take just the keys to generate the tables
fields_we_want = list(hisp_race_dict.keys())

# The loop variables must not be called `states`: that name is the module
# imported from `us` and is still needed by later cells (states.MD.fips).
for state_obj in d.values():
    #pull all the places for each state
    us_places.append(state_obj.get_places(additional_fields=fields_we_want))
    # then every tract, one request per county of the state
    for county_code in state_obj.county_codes:
        us_tracts.append(state_obj.get_by_tract(county_code=county_code,
                                                additional_fields=fields_we_want))

# Stack the per-state / per-county results into single frames with a fresh index
place_race_frame = pd.concat(us_places).reset_index(drop=True)
tract_race_frame = pd.concat(us_tracts).reset_index(drop=True)
In [963]:
def reformat_race(df, geo=None):
    """Collapse ACS table B03002 into six groups and flag homogeneous rows.

    Args:
        df: DataFrame holding the B03002 estimate columns listed in
            ``race_mapping`` below as numeric values, plus the identifying
            columns required by the chosen *geo*.
        geo: ``'place'`` keeps the race columns plus NAME/place/state;
            ``'tract'`` keeps the race columns plus
            NAME/tract/state/county/county_map. Any other value (including
            the default ``None``) keeps every column of *df*.

    Returns:
        A new DataFrame (the input is not modified) with the renamed count
        columns and these additions:
        AK, API, OTHER_MIX, HISP (combined counts); PER_WHITE, PER_BLACK,
        PER_API, PER_AKNA, PER_HISP, PER_OTHER (share of TOTAL);
        MAX_ETH_PER / MAX_ETH (largest share and the PER_ column it comes
        from, NaN when no share can be computed, e.g. TOTAL == 0);
        'Above 70', 'Above 80', 'Above 90' (1 when MAX_ETH_PER reaches the
        threshold, else 0).

    Raises:
        KeyError: if a required column is missing from *df*.
    """
    race_mapping = {'B03002_001E': 'TOTAL',
                    'B03002_003E': 'WHITE',  # White = White NH
                    'B03002_004E': 'BLACK',  # Black = Black NH
                    'B03002_005E': 'AK_NH',
                    'B03002_006E': 'ASIAN_NH',
                    'B03002_007E': 'PI_NH',
                    'B03002_008E': 'OTHER_NH',
                    'B03002_009E': 'MIX_NH',
                    'B03002_012E': 'HISP_ALL',
                    'B03002_013E': 'WHITE_H',
                    'B03002_014E': 'BLACK_H',
                    'B03002_015E': 'AK_H',
                    'B03002_016E': 'ASIAN_H',
                    'B03002_017E': 'PI_H',
                    'B03002_018E': 'OTHER_H',
                    'B03002_019E': 'MIX_H'}
    geo_columns = {'place': ['NAME', 'place', 'state'],
                   'tract': ['NAME', 'tract', 'state', 'county', 'county_map']}

    # rename() returns a new frame, so the caller's frame is never written to
    df = df.rename(columns=race_mapping)
    keep = geo_columns.get(geo) if isinstance(geo, str) else None
    if keep is not None:
        # .copy() so the assignments below go to an independent frame
        # instead of a slice of the renamed one
        df = df[list(race_mapping.values()) + keep].copy()

    # Hispanic AK, Asian and PI residents are counted with their race group
    # and taken out of HISP, so the six groups add up to TOTAL.
    df['AK'] = df['AK_NH'] + df['AK_H']
    df['API'] = df['ASIAN_H'] + df['ASIAN_NH'] + df['PI_H'] + df['PI_NH']
    df['OTHER_MIX'] = df['OTHER_NH'] + df['MIX_NH']
    df['HISP'] = df['HISP_ALL'] - df['AK_H'] - df['PI_H'] - df['ASIAN_H']

    # Share of the total population for each combined group
    percent_sources = [('PER_WHITE', 'WHITE'),
                       ('PER_BLACK', 'BLACK'),
                       ('PER_API', 'API'),
                       ('PER_AKNA', 'AK'),
                       ('PER_HISP', 'HISP'),
                       ('PER_OTHER', 'OTHER_MIX')]
    for percent_name, count_name in percent_sources:
        df[percent_name] = df[count_name] / df['TOTAL']
    percent_frame = df[[percent_name for percent_name, _ in percent_sources]]

    df['MAX_ETH_PER'] = percent_frame.max(axis=1)
    # Rows without any computable share (0/0 -> NaN) must not reach idxmax()
    # as all-NaN rows; give them a placeholder and blank the answer afterwards.
    has_share = percent_frame.notna().any(axis=1)
    df['MAX_ETH'] = percent_frame.fillna(-1).idxmax(axis=1).where(has_share)

    for flag_name, threshold in (('Above 70', .7),
                                 ('Above 80', .8),
                                 ('Above 90', .9)):
        df[flag_name] = np.where(df['MAX_ETH_PER'] >= threshold, 1, 0)
    return df
In [964]:
# Summarise both geographies and order them from most to least homogeneous
place_summary = reformat_race(place_race_frame, geo='place')
place_race_frame_full = place_summary.sort_values(by='MAX_ETH_PER', ascending=False)

tract_summary = reformat_race(tract_race_frame, geo='tract')
tract_race_frame_full = tract_summary.sort_values(by='MAX_ETH_PER', ascending=False)

# GEO_ID is the state, county and tract codes joined as text, in that order
geo_parts = tract_race_frame_full[['state', 'county', 'tract']].astype('str')
tract_race_frame_full['GEO_ID'] = (geo_parts['state']
                                   + geo_parts['county']
                                   + geo_parts['tract'])
In [1008]:
# Switch into the data folder and write both summary frames there as CSV
os.chdir(data)
for frame, csv_name in ((place_race_frame_full, '2016_ACS_race_place_full.csv'),
                        (tract_race_frame_full, '2016_ACS_race_tract_full.csv')):
    frame.to_csv(csv_name)
In [1012]:
# Summary statistics of the tract-level frame; a bare expression, so the
# result is only shown when run interactively.
tract_race_frame_full.describe()
Out[1012]:
In [5]:
# Make the data folder the working directory so the relative CSV file names
# used by the following cells resolve there.
os.chdir(data)
In [6]:
# Reload the place-level summary written earlier
place_race_frame_full = pd.read_csv('2016_ACS_race_place_full.csv')

# Places with more than 500,000 residents, and the highest AK/Native American
# and Other/Mixed shares found among them
large_places = place_race_frame_full[place_race_frame_full['TOTAL'] > 500000]
ak_max = large_places['PER_AKNA'].max()
per_other_max = large_places['PER_OTHER'].max()


def max_print(group, minimum):
    """Print the place where *group* holds the largest share.

    Args:
        group: a MAX_ETH value such as 'PER_WHITE'.
        minimum: only places with TOTAL greater than this are considered.

    Prints 'MOST <group>: <NAME> <share>%' for the matching place with the
    highest MAX_ETH_PER, or a short notice when no place matches.
    """
    candidates = place_race_frame_full[
        (place_race_frame_full['MAX_ETH'] == group)
        & (place_race_frame_full['TOTAL'] > minimum)
    ]
    if candidates.empty:
        print('MOST ' + group + ': no place with population over ' + str(minimum))
        return
    # Sort explicitly instead of trusting the row order of the CSV
    top = candidates.sort_values(by='MAX_ETH_PER', ascending=False).iloc[0]
    print('MOST ' + group + ": " + top['NAME'] + " "
          + format(top['MAX_ETH_PER'] * 100, '.2f') + '%')


for item in ['PER_WHITE', 'PER_BLACK', 'PER_HISP', 'PER_API']:
    max_print(item, 500000)

print("---")
print("No place over 500,000 with majority AK/Native American or Other/Mixed Race. Finding the max percentages instead")
print("---")
# Look the names up among the large places only; the maxima were computed on
# that subset, so a smaller place with an equal share must not be returned.
print('PER_AK: ', large_places[large_places['PER_AKNA'] == ak_max].iloc[0]['NAME'])
print('PER_OTHER: ', large_places[large_places['PER_OTHER'] == per_other_max].iloc[0]['NAME'])
In [11]:
# Show the row(s) of the place-level frame whose NAME is exactly
# 'El Paso city, Texas' (bare expression, displayed only when run interactively).
place_race_frame_full[place_race_frame_full['NAME']=='El Paso city, Texas']
Out[11]:
In [17]:
# Reload the tract-level CSV, using its first column as the index.
# NOTE(review): no dtype is given, so GEO_ID is presumably parsed as a number
# (a later cell compares it with an integer); that would drop the leading
# zero of single-digit state codes - confirm this is acceptable.
reload_tract = pd.read_csv('2016_ACS_race_tract_full.csv',index_col=0)
In [20]:
# Show the reloaded tract row(s) whose GEO_ID equals this number (bare
# expression, displayed only when run interactively).
reload_tract[reload_tract['GEO_ID']==41051007400]
Out[20]:
In [510]:
#share of the population living in rows (e.g. tracts) flagged as >=x% one ethnic group
def homogenous_pop(df,threshold_column):
    """Return the flagged share of the population as a percentage string.

    Args:
        df: frame with a 'TOTAL' population column and the flag column.
        threshold_column: name of a 0/1 flag column such as 'Above 70'.

    Returns:
        The TOTAL of rows flagged 1 divided by the overall TOTAL, times 100,
        formatted with one decimal place.
    """
    is_flagged = df[threshold_column] == 1
    flagged_population = df.loc[is_flagged, 'TOTAL'].sum()
    whole_population = df['TOTAL'].sum()
    return '{:.1f}'.format(flagged_population / whole_population * 100)
In [ ]:
#For those who are non-white and Hispanic, which to include within the "hispanic" group? Let's assess
# NOTE(review): la_county_data, philly_county_data and suffolk_county_data are
# not defined in the visible part of this notebook - confirm where they come from.
# The column names below are the upper-case names reformat_race() produces;
# geo=None is passed explicitly so no geography-specific columns are required.
hispanic_subgroups = ['WHITE_H', 'BLACK_H', 'AK_H', 'ASIAN_H', 'PI_H', 'OTHER_H', 'MIX_H']
for county_data in [la_county_data, philly_county_data, suffolk_county_data]:
    counties = reformat_race(county_data, geo=None)
    for item in hispanic_subgroups:
        # mean over rows of this subgroup's share of all Hispanic residents
        share = (counties[item] / counties['HISP_ALL']).mean()
        print(item + ' ' + format(share * 100, '.1f') + '%')
    print('------')
In [ ]:
## extra tutorial stuff
# Re-imported so this cell works even if the name `states` was rebound
# earlier in the session.
from us import states

# The field must be named before it is used in the query below.
table_name = 'B25034_010E'

#The get method is the core data access method on both the ACS and SF1 data sets.
#The first parameter is either a single string column or a tuple of columns.
#The second parameter is a geometry dict with a `for` key and an optional `in` key.
c.acs5.get(('NAME', table_name),
           {'for': 'state:{}'.format(states.MD.fips)}, year=2011) #set different years
In [ ]:
Some info on Census and US libraries from
https://github.com/datamade/census
## Description
#### ACS5
state(fields, state_fips)
state_county(fields, state_fips, county_fips)
state_county_blockgroup(fields, state_fips, county_fips, blockgroup)
state_county_subdivision(fields, state_fips, county_fips, subdiv_fips)
state_county_tract(fields, state_fips, county_fips, tract)
state_place(fields, state_fips, place)
state_district(fields, state_fips, district)
us(fields)
zipcode(fields, zip5)
#### ACS1
ACS1 Geographies
state(fields, state_fips)
state_district(fields, state_fips, district)
us(fields)
## Datasets
* acs5: ACS 5 Year Estimates (2016, 2015, 2014, 2013, 2012, 2011, 2010)
* acs1dp: ACS 1 Year Estimates, Data Profiles (2016, 2015, 2014, 2013, 2012)
* sf1: Census Summary File 1 (2010, 2000, 1990)
* sf3: Census Summary File 3 (2000, 1990)