In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
%matplotlib inline
sns.set_context('notebook')
In [2]:
pkg = mp.jupyter.open_package()
#pkg = mp.jupyter.open_source_package()
pkg
Out[2]:
In [3]:
hs = pkg.resource('housing').dataframe()
pop = pkg.resource('population').dataframe()
In [4]:
cols = [e.lower() for e in ['SERIALNO', 'SPORDER', 'PUMA', 'SEX', 'AGEP',
                            'RAC1P', 'HISP', 'POVPIP', 'PINCP', 'SCHL']]
weight_cols = [ c for c in pop.columns if c.startswith('pwgtp')]
dfx = pop[cols+weight_cols]
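The startswith('pwgtp') filter picks up the main person weight as well as the replicate weights. A quick sanity check (a sketch, assuming the standard PUMS layout of pwgtp plus pwgtp1 through pwgtp80):
# Expect 81 columns: the main weight plus 80 replicate weights (assumption
# about this extract; adjust if the package ships a different layout).
print(len(weight_cols), sorted(weight_cols)[:3])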
RAC1P: Recoded detailed race code
1  White alone
2  Black or African American alone
3  American Indian alone
4  Alaska Native alone
5  American Indian and Alaska Native tribes specified; or American Indian or Alaska Native, not specified and no other races
6  Asian alone
7  Native Hawaiian and Other Pacific Islander alone
8  Some Other Race alone
9  Two or More Races
In [5]:
rac1p_map = {
    1: 'white',
    2: 'black',
    3: 'amind',
    4: 'alaskanat',
    5: 'aian',
    6: 'asian',
    7: 'nhopi',
    8: 'other',
    9: 'many'
}
pop['race'] = pop.rac1p.astype('category')
pop['race'] = pop.race.cat.rename_categories(rac1p_map)
# The raceeth variable is the race variable, but with 'white' replaced
# with 'hisp' for records that have both is_hisp and white set. So, for
# raceeth, 'white' means 'non-hispanic white'.
pop['is_hisp'] = pop.hisp != 1
pop['raceeth'] = pop['race'].mask(((pop.is_hisp == True) & (pop.race == 'white')), 'hisp')
pop[['rac1p','race','is_hisp','raceeth']].head()
Out[5]:
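A quick check that the recode behaves as described, i.e. that 'white' in raceeth now excludes Hispanic records (a sketch):
# Cross-tabulate the original race recode against raceeth; the 'white' row
# should split between 'white' (non-Hispanic) and 'hisp', and every other
# race should map straight across.
pd.crosstab(pop.race, pop.raceeth)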
In [6]:
pop[pop.raceeth == 'white'].agep.hist()
Out[6]:
In [7]:
pop[pop.raceeth == 'hisp'].agep.hist()
Out[7]:
In [8]:
ages = ['18-25 YEARS',
        '26-29 YEARS',
        '30-34 YEARS',
        '35-39 YEARS',
        '40-44 YEARS',
        '45-49 YEARS',
        '50-54 YEARS',
        '55-59 YEARS',
        '60-64 YEARS',
        '65-69 YEARS',
        '70-74 YEARS',
        '75-79 YEARS',
        '80-84 YEARS',
        '85+ YEARS']
def extract_age(v):
    # Parse a label like '18-25 YEARS' into a closed Interval; the open-ended
    # '85+ YEARS' group gets an arbitrary upper bound of 120.
    if v.startswith('85'):
        return pd.Interval(left=85, right=120, closed='both')
    else:
        l, h, _ = v.replace('-', ' ').split()
        return pd.Interval(left=int(l), right=int(h), closed='both')
age_ranges = [ (extract_age(v), v) for v in ages]
age_index = pd.IntervalIndex(list(ar[0] for ar in age_ranges))
pop['age_group'] = pd.cut(pop.agep,age_index).astype('category')
pop['age_group'].cat.rename_categories(dict(age_ranges), inplace=True)
pop[['agep','age_group']].head()
Out[8]:
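pd.cut leaves values that fall outside every interval as NaN, so records under 18 get no age_group. A quick check (a sketch):
# Records younger than 18 fall outside the interval index and get NaN,
# so these two counts should match (barring missing agep values).
print((pop.agep < 18).sum(), pop.age_group.isna().sum())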
In [9]:
povlvls = ['0-99% FPL', '100-199% FPL', '200-299% FPL', '300% FPL AND ABOVE']
pov_index = pd.IntervalIndex(
    [pd.Interval(left=0, right=99, closed='both'),
     pd.Interval(left=100, right=199, closed='both'),
     pd.Interval(left=200, right=299, closed='both'),
     pd.Interval(left=300, right=501, closed='both')]
)
In [10]:
pop.povpip.describe()
Out[10]:
In [11]:
pop['pov_group'] = pd.cut(pop.povpip,pov_index).astype('category')
pop['pov_group'].cat.rename_categories(dict(zip(pov_index, povlvls)), inplace=True)
pop[['povpip','pov_group']].head()
Out[11]:
In [12]:
pop.groupby('puma').pwgtp5.sum().sum()
Out[12]:
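Summing a replicate weight by PUMA and then summing again is just the statewide total for that replicate. Comparing it with the main weight gives a rough sense of how much the replicates wobble around the point estimate (a sketch, assuming the main weight column is named pwgtp):
# Total population estimate from the main weight vs. replicate weight 5.
print(pop.pwgtp.sum(), pop.pwgtp5.sum())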
In [13]:
dfx = pop[cols+['age_group','pov_group','race','is_hisp','raceeth']+weight_cols]
display(dfx.head(20).T)
len(dfx)
Out[13]:
In [14]:
def build_set(df, rep_no):
    # Collect (repl - 1) extra copies of each row, so that together with the
    # original row it appears 'repl' times once appended to the base frame.
    new_rows = []
    for idx, row in df.iterrows():
        repl = row.at['pwgtp' + str(rep_no)]
        if repl > 1:
            new_rows.extend([row] * (repl - 1))
    return new_rows
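Iterating row by row is slow on a full PUMS extract. A vectorized sketch of the same idea, using index.repeat to expand each row to its replicate-weight count (assuming the weight column is a non-negative integer and the frame has a unique index):
def build_set_fast(df, rep_no):
    # Repeat each row max(weight, 1) times, so every record appears at least
    # once, matching the loop above, which keeps unreplicated rows via the
    # base frame they are appended to.
    w = df['pwgtp' + str(rep_no)].clip(lower=1).astype(int)
    return df.loc[df.index.repeat(w)].reset_index(drop=True)
This produces the same number of rows as the append below, just without building an intermediate list of Series.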
In [15]:
%time new_rows = build_set(dfx, 1)
In [16]:
%time t = dfx.copy().append(new_rows, ignore_index = True)
In [17]:
len(t)
Out[17]:
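Since every record appears at least once and records with a replicate weight above 1 appear that many times, the expanded length should equal the clipped sum of the weight (a sketch):
# Each record appears max(pwgtp1, 1) times in the expanded frame.
print(len(t), dfx.pwgtp1.clip(lower=1).sum())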
In [18]:
t
Out[18]:
In [19]:
from publicdata import parse_app_url
url = parse_app_url('census://2015/5/CA/140/B17001')
dfc = url.geoframe()
In [20]:
dfc.plot()
Out[20]:
In [21]:
# The PUMA shapefiles moved, so the publicdata package points at the wrong
# location; use the TIGER 2018 URL directly.
url = parse_app_url('shape+ftp://ftp2.census.gov/geo/tiger/TIGER2018/PUMA/tl_2018_06_puma10.zip')
pumas = url.get_resource().geoframe()
In [22]:
pumas.plot()
Out[22]:
In [23]:
url = parse_app_url('census://2015/5/CA/county/B17001')
url.geo_url.shape_url
Out[23]:
In [24]:
counties_pkg = mp.open_package('http://library.metatab.org/census.gov-counties-2017-2.csv')
counties = counties_pkg.resource('counties').geoframe()
In [25]:
sd = counties[counties.name == 'San Diego']
In [26]:
#import geopandas as gpd
#gpd.sjoin(pumas, sd)
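A sketch of the join the commented-out cell points at: re-project the PUMA layer to the county layer's CRS and keep the PUMAs that intersect San Diego County. This assumes both layers carry valid CRS information; the op= keyword matches geopandas versions of this era, newer releases spell it predicate=.
import geopandas as gpd

# Align coordinate systems before joining, then keep PUMAs that intersect
# the San Diego County polygon.
sd_pumas = gpd.sjoin(pumas.to_crs(sd.crs), sd, how='inner', op='intersects')
sd_pumas.plot()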