This notebook extracts precipitation timeseries for in districts in Uganda from NASA GPM satalite data
In [1]:
import pandas as pd
import geopandas as gp
from rasterstats import zonal_stats
from affine import Affine
import matplotlib as mpl
import matplotlib.pyplot as plt
import mpl_toolkits
from mpl_toolkits.basemap import Basemap
import numpy as np
import h5py
from osgeo import gdal
from descartes import PolygonPatch
import datetime
from __future__ import print_function
pd.set_option('io.hdf.default_format', 'table')
%matplotlib inline
In [2]:
from shapely.geometry.polygon import Polygon
def small_triangle(x, y):
"""
Make a small triangle at the given point.
"""
return Polygon([[x, y], [x, y + .001], [x + .001, y]])
# Load district shape files.
# TODO Clip Lake Victoria?
uganda_shapes = gp.GeoDataFrame.from_file("/home/nathan/GPM-district-precipitation/District_Boundaries_2014.shp")
# Add a weather stations for comparison with on the ground data.
uganda_shapes = uganda_shapes.append(dict(
DNAME2014="HUEN Weather Station",
geometry=small_triangle(32.444, 0.042)
), ignore_index=True)
uganda_shapes = uganda_shapes.append(dict(
DNAME2014="Kitale Weather Station",
geometry=small_triangle(35, 1.016)
), ignore_index=True)
uganda_shapes.plot(column='POP_2014', scheme='QUANTILES', k=3, colormap='OrRd')
def capitalize(s):
return " ".join(
map(lambda k: k[0].upper() + k[1:], s.lower().split(' ')))
uganda_shapes['district'] = uganda_shapes.DNAME2014.map(capitalize)
uganda_shapes.sort('district')
Out[2]:
In [3]:
import os
import os.path
import copy
import ftplib
import tempfile
username = os.environ.get('NASA_USERNAME')
password = os.environ.get('NASA_PASSWORD')
def get_total_precipitation(init_date, end_date):
"""
Get a numpy raster map of the total precipitation over the given date range in millimeters.
"""
file_name_pattern = ('/NRTPUB/imerg/early/%(year)4d%(month)02d/' +
'3B-HHR-E.MS.MRG.3IMERG.%(year)4d%(month)02d%(day)02d-' +
'S%(hour)02d%(minute)02d00-E%(end_hour)02d%(end_minute)02d59.%(minute_of_day)04d.V03E.RT-H5'
)
precip_data = None
iterval_minutes = 30
count = 0
start_date = copy.deepcopy(init_date)
while start_date <= end_date:
file_name = file_name_pattern % dict(
year=start_date.year,
month=start_date.month,
day=start_date.day,
hour=start_date.hour,
minute=start_date.minute,
end_hour=(start_date + datetime.timedelta(minutes=iterval_minutes - 1)).hour,
end_minute=(start_date + datetime.timedelta(minutes=iterval_minutes - 1)).minute,
minute_of_day=(
start_date - datetime.datetime(start_date.year, start_date.month, start_date.day)
).total_seconds() / 60
)
start_date += datetime.timedelta(minutes=iterval_minutes)
try:
# Deleting on close causes problems because the file
# cannot be read by hd5py if it is not closed first.
my_tempfile = tempfile.NamedTemporaryFile(delete=False)
ftp = ftplib.FTP("jsimpson.pps.eosdis.nasa.gov")
ftp.login(username, password)
ftp.retrbinary('RETR ' + file_name, my_tempfile.write)
my_tempfile.close()
with h5py.File(my_tempfile.name, 'r') as h5_file:
d = np.array(h5_file['Grid/precipitationCal'])
d = np.clip(d, 0, np.inf)
if precip_data is None:
precip_data = d
else:
precip_data += d
count += 1
except ftplib.error_perm as e:
if str(e).startswith('550'):
print("WARNING: File does not exist:")
print(file_name, os.path.isfile(file_name))
# When a file is missing, we assume the average rainfall for the rest of the
# time iterval occured during its time period.
# Usually a missing file is a result of choosing an interval that
# extends beyond the downloaded data's date range, however there are
# holes in the data. For example, there is missing data on 2016-3-12 from 12 to 3pm.
continue
else:
raise e
finally:
os.remove(my_tempfile.name)
#in mm/hr
average_precipitation_rate = precip_data / count
date_range_hours = (end_date - init_date).total_seconds() / (60 * 60)
return average_precipitation_rate * date_range_hours
In [7]:
# An arbitrary file chosen to get coordinates for projection
file_name = "/NRTPUB/imerg/early/201611/3B-HHR-E.MS.MRG.3IMERG.20161121-S070000-E072959.0420.V03E.RT-H5"
my_tempfile = tempfile.NamedTemporaryFile(delete=False)
ftp = ftplib.FTP("jsimpson.pps.eosdis.nasa.gov")
ftp.login(username, password)
ftp.retrbinary('RETR ' + file_name, my_tempfile.write)
my_tempfile.close()
with h5py.File(my_tempfile.name, 'r') as h5_file:
print("Keys:")
for key in h5_file['Grid'].keys():
print(" ", key)
lons = np.array(h5_file['Grid/lon'])
lats = np.array(h5_file['Grid/lat'])
os.remove(my_tempfile.name)
xmin, ymin, xmax, ymax = [lons.min(), lats.min(), lons.max(), lats.max()]
nrows, ncols = np.shape(data.transpose())
xres = (xmax - xmin) / float(ncols)
yres = (ymax - ymin) / float(nrows)
xy_to_raster_affine = Affine.from_gdal(xmin, xres, 0, ymax, 0, -yres)
In [8]:
# TODO: Init series
timeseries = pd.read_pickle('GPM-timeseries-by-district-uganda.p')
# Create a time series of precipitation by district with ticks for each day.
for day in pd.date_range(timeseries.date.max(), pd.Timestamp.now(), freq='D'):
data = get_total_precipitation(
pd.to_datetime(day),
pd.to_datetime(day) + datetime.timedelta(1))
print(day, ", ", end='')
zstats = zonal_stats(uganda_shapes,
data.transpose()[::-1],
affine=xy_to_raster_affine,
all_touched=True)
for idx, row in uganda_shapes.iterrows():
zstats[idx]['district'] = row.district
zstats[idx]['date'] = day
zstats_frame = pd.DataFrame(zstats)
# TODO: Add min and max
zstats_frame = zstats_frame.rename(columns={
'mean': 'district_mean_rainfall_mm'
}).drop(['min', 'max', 'count'], axis=1)
timeseries = timeseries.append(zstats_frame)
# Add computed columns, and save pickled version.
timeseries['month'] = timeseries.date.map(lambda x: pd.datetime(x.year, x.month, 1))
timeseries['week'] = timeseries.date.map(lambda x: pd.datetime(x.year, 1, 1) + pd.Timedelta(
7 * int(
(x - pd.datetime(x.year, 1, 1)).total_seconds() /
(60 * 60 * 24 * 7)), 'D'))
# Arbitrary threshold
timeseries['rainy_day'] = timeseries['district_mean_rainfall_mm'] > 1.0
timeseries.to_pickle('GPM-timeseries-by-district-uganda.p.bup')
timeseries.to_pickle('GPM-timeseries-by-district-uganda.p')
In [65]:
# Test get_total_precipitation by comparing its output with the movie here:
# http://svs.gsfc.nasa.gov/cgi-bin/details.cgi?aid=4285
from PIL import Image, ImageDraw
def exp_normalize(arr, exp=1.0):
"""
Convert an array to zero to one values, where one corresponds to the max value,
then apply an exponential function to adjust the curve of the color ramp.
"""
return (arr / arr.max()) ** exp
data = get_total_precipitation(
datetime.datetime(2016, 11, 29),
datetime.datetime(2016, 11, 29, 1))
im_array = np.array(exp_normalize(data.transpose()[::-1], .25) * 2 ** 8,
dtype=np.uint8)
Image.fromarray(im_array, 'L')
Out[65]:
In [68]:
def plot_uganda_district_rainfall(raster_data):
m = Basemap(projection='cyl',
llcrnrlon=25, llcrnrlat=-5,
urcrnrlon=38, urcrnrlat=10,
resolution='l')
fig = plt.figure(figsize=(10,10))
# The lat/lons for pcolormesh need to be for the top-left corner
# of the grid cell.
dlon = (lons[-1]-lons[0]) / len(lons[1:])
dlat = (lats[-1]-lats[0]) / len(lats[1:])
lons2, lats2 = np.meshgrid(
np.arange(lons[0], lons[-1] + dlon, dlon) - (dlon / 2),
np.arange(lats[0], lats[-1] + dlat, dlat) - (dlat / 2))
m.pcolormesh(lons2, lats2, exp_normalize(raster_data, .1).transpose(), cmap=plt.cm.Blues, latlon=True)
ax = plt.gca()
patches = []
#add polygons
zstats = zonal_stats(uganda_shapes,
raster_data.transpose()[::-1],
affine=xy_to_raster_affine,
all_touched=True)
max_mean = max([stat['mean'] for stat in zstats])
for idx, dist in uganda_shapes.iterrows():
poly = dist.geometry
stats = zstats[idx]
if poly.geom_type == 'Polygon':
p = PolygonPatch(poly, facecolor=plt.cm.Reds((stats['mean'] / max_mean) ** .1), alpha=.4)
patches.append(p)
ax.add_collection(mpl.collections.PatchCollection(patches, match_original=True))
m.drawcoastlines()
m.drawparallels(np.arange(-80.,80.,5.))
m.drawmeridians(np.arange(-180.,180.,5.))
return m
# Verify that plot_uganda_district_rainfall is accurate by using zonal_stats's underlying functions
# to draw the Uganda district shapes on a raster then plot it.
from rasterio import features
from rasterstats.io import read_features
from shapely.geometry import shape
rv_array = features.rasterize(
[shape(feature['geometry']) for feature in read_features(uganda_shapes)],
out_shape=data.transpose().shape,
transform=xy_to_raster_affine,
fill=255,
dtype='uint8',
all_touched=True)
plot_uganda_district_rainfall(np.array(~rv_array, dtype=np.float32)[::-1].transpose() / 255)
Out[68]:
In [69]:
# Check that the pixel at the lat/lng for the Kitale Weather Station is filled
~rv_array[::-1][(lats >= 1) & (lats <= 1.2), :][:, (lons >= 35) & (lons <= 35.2)]
Out[69]:
In [70]:
# Test zone stats by sumperimposing uganda district choropleth onto rainfall raster.
# The shade of each district should correspond to the number and intensity of the
# blue rainfall pixels overlapping it.
data = get_total_precipitation(datetime.datetime(2016, 11, 29), datetime.datetime(2016, 11, 29,1))
print(plot_uganda_district_rainfall(data))
data2 = get_total_precipitation(datetime.datetime(2016, 10, 2), datetime.datetime(2016, 10, 2, 1))
print(plot_uganda_district_rainfall(data2))
In [71]:
# Highlight the basemap region on the original raster image to verify its alignment
data = get_total_precipitation(
datetime.datetime(2016, 11, 29),
datetime.datetime(2016, 11, 29, 1))
im_array = np.array((data / data.max()) ** .25 * 2 ** 8,
dtype=np.uint8)
im_array[:, (lats > -5) & (lats < 10)] ^= 255
im_array[(lons > 25) & (lons < 38), :] ^= 255
Image.fromarray(im_array.transpose()[::-1], 'L')
Out[71]:
In [13]:
# Monthly rainfall comparison for Wakiso districts.
d = pd.DataFrame(timeseries)
d = d[d.district.str.startswith('Wakiso')]
d.groupby(['district', 'month']).sum().reset_index().pivot(
index='month', columns='district', values='district_mean_rainfall_mm'
).plot(kind='bar', figsize=(12, 8))
Out[13]:
In [14]:
# Number of rainy days per month in Wakiso districts.
d = pd.DataFrame(timeseries)
d = d[d.district.str.startswith('Wakiso')]
d.groupby(['district', 'month']).sum().reset_index().pivot(
index='month', columns='district', values='rainy_day'
).plot(kind='bar', figsize=(12, 8))
Out[14]:
In [18]:
# Daily rainfall over HUEN Weather station region.
d = pd.DataFrame(timeseries)
d = d[d.district.str.startswith('Huen')]
d.groupby(['district', 'date']).sum().reset_index().pivot(
index='date', columns='district', values='district_mean_rainfall_mm'
).plot(figsize=(12, 8))
Out[18]:
In [19]:
# Use data from HUEN weather station in Entebbe to cross-check satalite data
# Data source: https://www.wunderground.com/history/airport/HUEN
weather_station_data = pd.read_csv("HUEN.csv", converters=dict(
EAT=pd.to_datetime
)).set_index("EAT")
weather_station_data['month'] = weather_station_data.index.map(lambda x: "%4d-%02d" % (x.year, x.month))
weather_station_data['precipitationMm'] = weather_station_data.PrecipitationIn * 25.4
weather_station_data[['precipitationMm']].plot(kind='line', figsize=(12, 8))
Out[19]:
In [20]:
# Montly precipitation from HUEN station
d = pd.DataFrame(weather_station_data)
d.groupby(['month']).sum()['precipitationMm'].plot(kind='bar', figsize=(12, 8))
Out[20]:
The HUEN data shows quite a bit less precipitation than the GPM satellite, and many of the peaks do not match up.
This website shows Entebbe's average yearly rainfall. It is closer the the satellite data than the HUEN data in magnitude: https://weather-and-climate.com/average-monthly-Rainfall-Temperature-Sunshine,entebbe,Uganda
The wolfram alpha data for Entebbe precipitation does not appear to match up perfectly with the satellite data. It mentions the HUEN weather station. It is closer in magnitude to the satellite data than the weather station data: http://www.wolframalpha.com/input/?i=entebbe+rainfall+by+month
In [25]:
# Compare NOAA data for Kitale Weather station with satellite data
# source: https://www.ncdc.noaa.gov/cdo-web/orders?email=breit@ecohealthalliance.org&id=855687
kitale_station_data = pd.read_csv("kitale-weather-station.csv", converters=dict(
DATE=pd.to_datetime)).set_index('DATE')
kitale_station_data['month'] = kitale_station_data.index.map(lambda x: pd.datetime(x.year, x.month, 1))
kitale_station_data['NOAA_precipitationMm'] = kitale_station_data.PRCP * 25.4
d = pd.DataFrame(timeseries)
d = d[d.district.str.startswith('Kitale Weather Station')]
d = d.groupby(['district', 'date']).sum().reset_index().pivot(
index='date', columns='district', values='district_mean_rainfall_mm'
)
# Shift forward one day so measurement timespans overlap more.
# I don't know the exact time ranges of meaurement for the Kitale data,
# so I shifted the data based on what provides the maximum correlation.
kitale_station_data.index = kitale_station_data.index - pd.Timedelta("1 day")
d = d.join(kitale_station_data[['NOAA_precipitationMm']])
d.fillna(-1)[d.index > pd.datetime(2016,1,1)].plot(figsize=(12, 8))
Out[25]:
Although there appears to be a large amount of disagreement between the datasets, I believe they pertain to the same region because the correlation coefficient is generally much smaller when using a different locations or shifting the timeranges.
In [26]:
d = pd.DataFrame(timeseries)
# Wakiso is arbitrarily chosen for comparison of correlation coefficients
d = d[d.district.str.startswith('Kitale Weather Station') | d.district.str.startswith('Wakiso')]
d = d.groupby(['district', 'date']).sum().reset_index().pivot(
index='date', columns='district', values='district_mean_rainfall_mm'
)
d = d.join(kitale_station_data[['NOAA_precipitationMm']])
d[~pd.isnull(d.NOAA_precipitationMm)].corr()
Out[26]:
In [31]:
d = pd.DataFrame(timeseries)
req_date = "2016-11-2"
d = d[d.week == d.query("week < '%s'" % req_date).week.max()]
d = d.groupby(['district', 'week']).sum().reset_index()
with open("uganda-district-precipitation-2016-11.json", "w") as f:
s = gp.GeoDataFrame(uganda_shapes)
s.geometry = s.geometry.simplify(.002)
s = s.merge(d, on="district")\
.drop(['week'], 1)
s['fill-color'] = '#0000FF'
s['fill-opacity'] = s.district_mean_rainfall_mm / s.district_mean_rainfall_mm.max()
f.write(s.to_json())
https://github.com/geopandas/geopandas/blob/master/examples/choropleths.ipynb https://github.com/davidbrochart/gross/blob/8c1111b87b87b57fa3462abb1a19bb95d94ccac1/ipynb/amazon1.ipynb https://github.com/JacksonTanBS/iPythonNotebooks/blob/master/150528%20How%20Much%20of%20Earth%20is%20Raining%20at%20Any%20One%20Time.ipynb http://earthpy.org/pandas-basics.html https://borealperspectives.wordpress.com/2016/03/07/plotting-polygon-shapefiles-on-a-matplotlib-basemap-with-geopandas-shapely-and-descartes/