In [1]:
from ambry import get_library
l = get_library()
b = l.bundle('cdph.ca.gov-hci-0.0.2')
First, create a set of views to limit the individual indicators to one record per county. The Ambry SQL parser is ver simplistic, and can't handle anything mroe then very simple joins.
In [2]:
w = b.warehouse('hci_counties')
w.clean()
print w.dsn
w.query("""
-- Get only counties in California
CREATE VIEW geo AS SELECT gvid, name AS county_name, geometry FROM census.gov-tiger-2015-counties
WHERE statefp = 6;
-- Get only records for all race/ethinicities
CREATE VIEW hf_total AS SELECT gvid, mrfei FROM cdph.ca.gov-hci-healthy_food-county
WHERE race_eth_name = 'Total';
-- Get only records for all race/ethinicities
CREATE VIEW aq_total AS SELECT gvid, pm25_concentration FROM cdph.ca.gov-hci-air_quality-county
WHERE race_eth_name = 'Total';
-- THe overty table has a lot of otrher categories, for report year and type of poverty
CREATE VIEW pr_total AS SELECT gvid, percent FROM cdph.ca.gov-hci-poverty_rate-county
WHERE race_eth_name = 'Total' AND reportyear='2008-2010' AND poverty='Overall';
""").close()
Now we can run a query to join the indicators.
In [3]:
sql="""
SELECT county_name, mrfei, pm25_concentration, percent as percent_poverty FROM geo as counties
JOIN hf_total ON hf_total.gvid = counties.gvid
JOIN aq_total ON aq_total.gvid = counties.gvid
JOIN pr_total ON pr_total.gvid = counties.gvid;
"""
df = w.dataframe(sql)
df.head()
Out[3]:
In [4]:
df.corr()
Out[4]:
Plot the PM2.5 Concentration, a measure of particulate air polution.
In [5]:
%matplotlib inline
sql="""
SELECT county_name, mrfei, pm25_concentration, percent as percent_poverty, geometry FROM geo as counties
LEFT JOIN hf_total ON hf_total.gvid = counties.gvid
LEFT JOIN aq_total ON aq_total.gvid = counties.gvid
LEFT JOIN pr_total ON pr_total.gvid = counties.gvid;
"""
w.geoframe(sql).plot(column='pm25_concentration')
Out[5]: