In [13]:
import numpy as np
import os
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
# Show plots inline in notebook
%matplotlib inline
In [2]:
common_data_path = 'Data/USCensusACS/IncomeAndOccupancySeattle/'
# Read in the census data, using the GEO.id column as the index
df2012 = pd.read_csv(os.path.join(common_data_path, 'ACS_12_1YR_B25118_with_ann.csv'),
index_col='GEO.id')
#print(df2012)
df2013 = pd.read_csv(os.path.join(common_data_path, 'ACS_13_1YR_B25118_with_ann.csv'),
index_col='GEO.id')
#print(df2012)
df2014 = pd.read_csv(os.path.join(common_data_path, 'ACS_14_1YR_B25118_with_ann.csv'),
index_col='GEO.id')
In [3]:
print(df2012['HD01_VD03'])
In [4]:
type(df2012)
Out[4]:
In [5]:
df2012.index
Out[5]:
In [6]:
seattle2012 = df2012.loc['1600000US5363000']
print(seattle2012[['HD01_VD01']])
type(seattle2012)
Out[6]:
In [36]:
def income_bracket_buckets(df):
""" Return two Series of Seattle renters by income bracket, and owners by income bracket """
seattle_row = df.loc['1600000US5363000']
#print("ibb: seattle_row={0}".format(seattle_row))
#print("ibb: read csv default type = {0}".format(type(seattle_row.HD01_VD13)))
##
# Owner
seattle_owner = Series([0] * 5,
#index=['>$150K', '$100K-$150K', '$75K-$100K', '$50K-$75K', '<$50K'])
index=['<$50K', '$50K-$75K','$75K-$100K','$100K-$150K', '>$150K'])
seattle_owner['>$150K'] = int(seattle_row.HD01_VD13)
seattle_owner['$100K-$150K'] = int(seattle_row.HD01_VD12)
seattle_owner['$75K-$100K'] = int(seattle_row.HD01_VD11)
seattle_owner['$50K-$75K'] = int(seattle_row.HD01_VD10)
# Under 50K has multiple sub-buckets that will be combined into one here
owner_under50K_cols = ['HD01_VD03', 'HD01_VD04', 'HD01_VD05', 'HD01_VD06',
'HD01_VD07', 'HD01_VD08', 'HD01_VD09']
# Convert <50K cells from string to integer before summing
seattle_row[owner_under50K_cols] = seattle_row[owner_under50K_cols].astype('int')
seattle_owner['<$50K'] = seattle_row[owner_under50K_cols].values.sum()
#print(seattle_owner)
##
# Renter
seattle_renter = Series([0] * 5,
#index=['>$150K', '$100K-$150K', '$75K-$100K', '$50K-$75K', '<$50K'])
index=['<$50K', '$50K-$75K','$75K-$100K','$100K-$150K', '>$150K'])
seattle_renter['>$150K'] = int(seattle_row.HD01_VD25)
seattle_renter['$100K-$150K'] = int(seattle_row.HD01_VD24)
seattle_renter['$75K-$100K'] = int(seattle_row.HD01_VD23)
seattle_renter['$50K-$75K'] = int(seattle_row.HD01_VD22)
# Under 50K has multiple sub-buckets that will be combined into one here
renter_under50K_cols = ['HD01_VD15', 'HD01_VD16', 'HD01_VD17', 'HD01_VD18',
'HD01_VD19', 'HD01_VD20', 'HD01_VD21']
# Convert <50K cells from string to integer before summing
seattle_row[renter_under50K_cols] = seattle_row[renter_under50K_cols].astype('int')
seattle_renter['<$50K'] = seattle_row[renter_under50K_cols].values.sum()
return (seattle_owner, seattle_renter)
In [37]:
owner2012, renter2012 = income_bracket_buckets(df2012)
owner2013, renter2013 = income_bracket_buckets(df2013)
owner2014, renter2014 = income_bracket_buckets(df2014)
print("Owner2012:\n{0}".format(owner2012))
print("Owner2013:\n{0}".format(owner2013))
print("Owner2014:\n{0}".format(owner2014))
print("Renter2012:\n{0}".format(renter2012))
print("Renter2013:\n{0}".format(renter2013))
print("Renter2014:\n{0}".format(renter2014))
In [38]:
# Merge tables into dataframe
seattleOwnerRenter = DataFrame({'Owner2012': owner2012,
'Owner2013': owner2013,
'Owner2014': owner2014,
'Renter2012': renter2012,
'Renter2013': renter2013,
'Renter2014': renter2014})
print(seattleOwnerRenter)
In [39]:
# Add columns with percentage change 2012:2013 and 2013:2014, renters and owners
sor = seattleOwnerRenter
renterPctChg12_13 = (sor.Renter2013 - sor.Renter2012) / sor.Renter2012 * 100
renterPctChg13_14 = (sor.Renter2014 - sor.Renter2013) / sor.Renter2013 * 100
ownerPctChg12_13 = (sor.Owner2013 - sor.Owner2012) / sor.Owner2012 * 100
ownerPctChg13_14 = (sor.Owner2014 - sor.Owner2013) / sor.Owner2013 * 100
seattleOwnerRenter['OwnerPctChg2012_2013'] = ownerPctChg12_13
seattleOwnerRenter['OwnerPctChg2013_2014'] = ownerPctChg13_14
seattleOwnerRenter['RenterPctChg2012_2013'] = renterPctChg12_13
seattleOwnerRenter['RenterPctChg2013_2014'] = renterPctChg13_14
#print("{0:.2f}".format(seattleOwnerRenter))
pd.options.display.float_format = '{:,.2f}'.format
print(seattleOwnerRenter)
In [40]:
type(seattleOwnerRenter.RenterPctChg2012_2013['>$150K'])
Out[40]:
In [41]:
# Test use of matplotlib inside python notebook
# Make a bar plot of a small series of 10 random values centered at 0
seedval=206
np.random.seed(seedval)
s = Series(np.random.rand(10) - 0.5)
s.plot(kind='barh'); # Use semi-colon to suppress Notebooks Out[] output
In [42]:
# First pass at plotting rent vs buy
# Get the subset of data to plot
sorsub = sor[['OwnerPctChg2012_2013','RenterPctChg2012_2013']]
# Desired order of columns, left to right (top to bottom): ,index=['>$150K', '$100K-$150K', '$75K-$100K', '$50K-$75K', '<$50K']
plt.figure();
sorsub.plot(kind='barh', title='High-income housing');
In [35]:
sorsub.index
Out[35]:
In [ ]: