In [1]:
import requests
import datetime
from calendar import monthrange
import os.path
In [2]:
# https://files.airnowtech.org/airnow/2016/20161101/peak_aqi_usa.jpg
image_url_base = "https://files.airnowtech.org/airnow/{}/{}/peak_aqi_usa.jpg"
# note that there's also an API at https://docs.airnowapi.org/
# but they limit their endpoints to 500/day
# the right way to do this is to use their download api
# for example, in https://files.airnowtech.org/?prefix=airnow/2016/20161108/
# join monitoring_site_locations.dat and daily_data.dat
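In [ ]:
# quick sanity check (not from the original run): the template above should
# expand to exactly the example URL in the comment for 1 Nov 2016
image_url_base.format(2016, "20161101")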
In [3]:
def download_aqi_images(numdays, directory):
    # walk backwards from today, fetching one peak-AQI map image per day
    s = requests.Session()
    base = datetime.datetime.today()
    date_list = [base - datetime.timedelta(days=x) for x in range(numdays)]
    for date in date_list:
        ymd = "{}{:02}{:02}".format(date.year, date.month, date.day)
        download_file = "{}/{}.jpg".format(directory, ymd)
        if os.path.isfile(download_file):
            # already on disk; skip it
            continue
        url = image_url_base.format(date.year, ymd)
        r = s.get(url, stream=True)
        if r.status_code == 200:
            with open(download_file, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
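In [ ]:
# usage sketch: fetch the last 100 daily maps into pm25_images/ (the
# directory the image cells below read from); this assumes the directory
# already exists, since the function doesn't create it
download_aqi_images(100, "pm25_images")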
In [4]:
from sklearn.cluster import KMeans
import numpy as np
In [5]:
from matplotlib import pyplot as plt
import seaborn
%matplotlib inline
In [6]:
# scipy.misc.imread has been deprecated and removed from modern SciPy;
# matplotlib's imread (backed by Pillow) reads the JPEG just as well
img = plt.imread("pm25_images/20161113.jpg")
plt.imshow(img)
Out[6]:
In [7]:
type(img)
Out[7]:
In [8]:
img.shape, img.dtype
Out[8]:
In [9]:
# flatten the H x W x 3 image into a (num_pixels, 3) array of RGB values
img = img.reshape((img.shape[0] * img.shape[1], 3))
img.shape, img.dtype
Out[9]:
In [10]:
clusters = 10
clt = KMeans(n_clusters=clusters)
clt.fit(img)
Out[10]:
In [11]:
# from http://www.pyimagesearch.com/2014/05/26/opencv-python-k-means-color-clustering/
def centroid_histogram(clt):
    # grab the number of different clusters and create a histogram
    # based on the number of pixels assigned to each cluster
    numLabels = np.arange(0, len(np.unique(clt.labels_)) + 1)
    (hist, _) = np.histogram(clt.labels_, bins=numLabels)

    # normalize the histogram, such that it sums to one
    hist = hist.astype("float")
    hist /= hist.sum()

    # return the histogram
    return hist
In [12]:
cluster_histogram = centroid_histogram(clt)
In [13]:
# matplotlib expects RGB values in [0, 1], so scale down the 0-255 centroids
colors = [[x / 255 for x in center] for center in clt.cluster_centers_]
colors
Out[13]:
In [14]:
plt.bar(np.arange(len(cluster_histogram)), cluster_histogram, color=colors)
Out[14]:
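In [ ]:
# one way to read the clusters (a sketch, assuming the map uses the standard
# EPA AQI palette): assign each centroid to its nearest AQI category color
# and sum the pixel fractions per category; basemap colors like the white
# background and gray borders will land in *some* bucket, so treat the
# numbers as rough
aqi_palette = {
    "Good": (0, 228, 0),
    "Moderate": (255, 255, 0),
    "Unhealthy for Sensitive Groups": (255, 126, 0),
    "Unhealthy": (255, 0, 0),
    "Very Unhealthy": (143, 63, 151),
    "Hazardous": (126, 0, 35),
}

def nearest_category(center):
    # squared Euclidean distance in RGB space; crude but fine for a tally
    return min(aqi_palette,
               key=lambda name: sum((c - p) ** 2
                                    for c, p in zip(center, aqi_palette[name])))

category_fractions = {}
for center, frac in zip(clt.cluster_centers_, cluster_histogram):
    name = nearest_category(center)
    category_fractions[name] = category_fractions.get(name, 0.0) + frac
category_fractions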
In [15]:
import yaml
In [16]:
with open("config.yml", 'r') as stream:
try:
config = yaml.safe_load(stream)
except yaml.YAMLError as exc:
print(exc)
In [17]:
# http://www.airnowapi.org/aq/observation/zipCode/historical/?format=application/json&zipCode=43085&date=2016-11-08T00-0000&distance=25&API_KEY=???
airnow_api_base_url = "http://www.airnowapi.org/aq/observation/zipCode/historical/?format=application/json&zipCode={}&date={}&distance=25&API_KEY={}"
In [18]:
# I've hit my rate limit for the day (only 500 queries!)
# but this should work
def fetch_aqis_for_zipcode(zipcode, numdays, apikey):
    s = requests.Session()
    base = datetime.datetime.today()
    date_list = [base - datetime.timedelta(days=x) for x in range(numdays)]
    aqis = {}
    for date in date_list:
        ymd = "{}-{:02}-{:02}T00-0000".format(date.year, date.month, date.day)
        url = airnow_api_base_url.format(zipcode, ymd, apikey)
        r = s.get(url)
        if r.status_code == 200:
            for report in r.json():
                if report['ParameterName'] == 'PM2.5':
                    aqis[date] = report['AQI']
    return aqis
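In [ ]:
# since the API allows only 500 queries/day, a simple on-disk cache avoids
# re-fetching on notebook re-runs; this is a sketch wrapping the function
# above, with aqi_cache.json as an arbitrary filename
import json

def fetch_aqis_cached(zipcode, numdays, apikey, cache_file="aqi_cache.json"):
    # load any previously fetched results
    cache = {}
    if os.path.isfile(cache_file):
        with open(cache_file) as f:
            cache = json.load(f)
    key = "{}-{}".format(zipcode, numdays)
    if key not in cache:
        aqis = fetch_aqis_for_zipcode(zipcode, numdays, apikey)
        # JSON keys must be strings, so store the dates as YYYY-MM-DD
        cache[key] = {d.strftime("%Y-%m-%d"): v for d, v in aqis.items()}
        with open(cache_file, 'w') as f:
            json.dump(cache, f)
    return {datetime.datetime.strptime(d, "%Y-%m-%d"): v
            for d, v in cache[key].items()}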
In [19]:
aqis = fetch_aqis_for_zipcode(33601, 100, config["airnow_api_key"])
In [20]:
import matplotlib.dates as mdates

fig, ax = plt.subplots(1)
dates = sorted(aqis.keys())
aqi_values = [aqis[date] for date in dates]
ax.plot(dates, aqi_values, label="Tampa")

# rotate and align the tick labels so they look better
fig.autofmt_xdate()

# use a month/day format for the x-axis tick labels
myFmt = mdates.DateFormatter('%m/%d')
ax.xaxis.set_major_formatter(myFmt)
plt.legend()
Out[20]:
In [21]:
import pandas as pd
In [22]:
# https://s3-us-west-1.amazonaws.com//files.airnowtech.org/airnow/2016/20161108/daily_data.dat
# or monitoring_site_locations.dat
airnow_file_baseurl = "https://s3-us-west-1.amazonaws.com//files.airnowtech.org/airnow/{}/{}/{}"
In [23]:
date = datetime.datetime.today()
In [24]:
def download_daily_data_and_monitoring(date, directory):
    s = requests.Session()
    for file in ['daily_data.dat', 'monitoring_site_locations.dat']:
        ymd = "{}{:02}{:02}".format(date.year, date.month, date.day)
        download_file = "{}/{}-{}".format(directory, ymd, file)
        if os.path.isfile(download_file):
            continue
        url = airnow_file_baseurl.format(date.year, ymd, file)
        r = s.get(url, stream=True)
        if r.status_code == 200:
            with open(download_file, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        else:
            print(r.status_code)
In [25]:
# fetch November 9th's files, which the next cells read
download_daily_data_and_monitoring(datetime.datetime(2016, 11, 9), "pm25_files")
In [26]:
observations = pd.read_csv("pm25_files/20161109-daily_data.dat",
                           sep="|",
                           encoding="ISO-8859-1",
                           header=None,
                           names=["Valid date", "AQSID", "Sitename",
                                  "Parameter name", "Reporting units", "Value",
                                  "Averaging period", "Data Source"],
                           )
In [27]:
loc_field_names = ["AQSID", "Parameter name", "Site code", "Sitename",
                   "Status", "Agency ID", "Agency name", "EPA region",
                   "Latitude", "Longitude", "Elevation", "GMT offset",
                   "Country code", "CMSA code", "CMSA name", "MSA code",
                   "MSA name", "State code", "State name", "County code",
                   "County name", "City code", "Nada"]
# we don't want every field here
locs = pd.read_csv("pm25_files/20161109-monitoring_site_locations.dat",
                   sep="|",
                   encoding="ISO-8859-1",
                   header=None,
                   names=loc_field_names,
                   usecols=["AQSID", "Longitude", "Latitude", "State name", "County name"]
                   )
# each site id will have multiple rows based on multiple parameter sensors
locs = locs.drop_duplicates()
locs.head()
Out[27]:
In [28]:
locs = locs.set_index(["AQSID"])
locs.head()
Out[28]:
In [29]:
# we only care about pm2.5
observations = observations[observations["Parameter name"] == "PM2.5-24hr"]
observations = observations.set_index(["AQSID"])
observations.head()
Out[29]:
In [30]:
joined = observations.join(locs)
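In [ ]:
# some observations come from sites that are missing from the locations
# file, so it's worth checking how many rows lack location metadata
# (the pd.notnull filter used below exists for exactly this reason)
joined["State name"].isnull().sum()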
In [31]:
ohio = joined[joined["State name"] == "OH"]
In [32]:
ohio
Out[32]:
In [33]:
# download the last 100 days of files
numdays = 100
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(numdays)]
for date in date_list:
    print("downloading files for {}".format(date))
    download_daily_data_and_monitoring(date, "pm25_files")
In [34]:
# meanwhile, let's do some descriptive stats on our data
joined[pd.notnull(joined["State name"])].Value.hist(bins=50)
Out[34]:
In [35]:
joined[pd.notnull(joined["State name"])].describe()
Out[35]:
In [36]:
ohio.describe()
Out[36]:
In [37]:
joined[joined.Value > 51]
Out[37]:
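In [ ]:
# with 100 days of files on disk, here's a sketch of stitching them into a
# single statewide PM2.5 time series; it assumes the download loop above
# has finished and reuses the column names and the locs table from earlier
import glob

frames = []
for path in sorted(glob.glob("pm25_files/*-daily_data.dat")):
    day = pd.read_csv(path, sep="|", encoding="ISO-8859-1", header=None,
                      names=["Valid date", "AQSID", "Sitename",
                             "Parameter name", "Reporting units", "Value",
                             "Averaging period", "Data Source"])
    frames.append(day[day["Parameter name"] == "PM2.5-24hr"])

all_days = pd.concat(frames).set_index("AQSID").join(locs)
ohio_all = all_days[all_days["State name"] == "OH"].copy()
ohio_all["date"] = pd.to_datetime(ohio_all["Valid date"])
ohio_all.groupby("date").Value.mean().plot()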