In [470]:
import pandas as pd
Exporting this entire dataset as a CSV provides a file named:
Production_Crops_E_All_Data.csv
the entire dataset is ~250MB
Grep can be used on Mac/Linux to quickly filter only features of interest:
$ grep "Yield" Production_Crops_E_All_Data.csv > YieldData.csv
the filtered dataset is ~65MB
In [471]:
# Use Pandas to read the CSV file and store it in a DataFrame object.
# NOTE(review): YieldData.csv is the grep-filtered subset produced above;
# grep keeps only lines containing "Yield", so confirm the header row
# survived the filter (read_csv treats the first line as the header).
yieldData = pd.read_csv('YieldData.csv')
def countryNameMap(name):
    """Map an FAO dataset country name onto the name used by the plotting
    shapefile; names without a special case are returned unchanged.

    Note that several territories intentionally collapse onto a single
    shapefile name (e.g. Hong Kong / Macao / mainland all become 'China').
    """
    shapefileFixups = {
        "Democratic People's Republic of Korea": 'Dem. Rep. Korea',
        'Republic of Moldova': 'Moldova',
        'China, mainland': 'China',
        'The former Yugoslav Republic of Macedonia': 'Macedonia',
        'China, Taiwan Province of': 'Taiwan',
        'United States of America': 'United States',
        'Iran (Islamic Republic of)': 'Iran',
        'Ethiopia PDR': 'Ethiopia',
        'Bolivia (Plurinational State of)': 'Bolivia',
        'China, Hong Kong SAR': 'China',
        'United States Virgin Islands': 'United States',
        'USSR': 'Russian Federation',
        'United Republic of Tanzania': 'Tanzania',
        'China, Macao SAR': 'China',
        'Venezuela (Bolivarian Republic of)': 'Venezuela',
        'Sudan (former)': 'Sudan',
        'Viet Nam': 'Vietnam',
        'Gambia': 'The Gambia',
        'Syrian Arab Republic': 'Syria',
        "Lao People's Democratic Republic": 'Lao PDR',
        'Occupied Palestinian Territory': 'Palestine',
    }
    return shapefileFixups.get(name, name)
# Convert the country names listed above to match with the shapefile for plotting.
# This deliberately produces duplicate Country values for territories that
# collapse onto one shapefile name (e.g. Hong Kong / Macao -> China).
yieldData.Country = yieldData.Country.map(countryNameMap)
In [472]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Spoof the UserAgent string to appear as a typical browser
# (some sites serve different/blocked content to headless agents)
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = \
    ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0")

# Ignore SSL errors and set a logfile path for troubleshooting.
# NOTE(review): PhantomJS support was removed from modern Selenium releases;
# this cell requires Selenium <= 3.x and a phantomjs binary on PATH.
# The log path below is machine-specific -- parameterize before sharing.
driver = webdriver.PhantomJS(desired_capabilities=dcap,
                             service_args=['--ignore-ssl-errors=true', '--ssl-protocol=tlsv1'],
                             service_log_path="/Users/kyledunn/phantom.log")

# Fake the window size to appear as a typical browser
driver.set_window_size(1440, 1024)
In [473]:
# Create a mapping for countries in the dataset that differ from the Wikipedia URL where it is found
def countryMap(name):
if name == 'Belgium-Luxembourg':
return "Luxembourg"
elif name == 'China, Hong Kong SAR':
return "Hong_Kong"
elif name == 'China, Macao SAR':
return "Macau"
elif name == 'China, Taiwan Province of':
return "Taiwan"
elif name == 'China, mainland':
return "China"
elif name == 'Congo':
return "Democratic_Republic_of_the_Congo"
elif name == 'Czechoslovakia':
return "Czech_Republic"
elif name == 'Ethiopia PDR':
return "Ethiopia"
elif name == 'Georgia':
return "Georgia_(country)"
elif name == 'Sudan (former)':
return "Sudan"
elif name == 'USSR':
return "Russia"
elif name == 'Uganda':
return "Kampala"
elif name == 'Yugoslav SFR':
return "Belgrade"
else:
return name
In [474]:
# Define a function to scrape Wikipedia for coordinates of each country's capital;
# the idea is to use this as an approximate country center.
def getCapitalCoords(country):
    """Scrape the Wikipedia page for `country` and return capital coordinates.

    Parameters
    ----------
    country : str
        Country name as it appears in the yield dataset; translated to a
        Wikipedia URL suffix via countryMap().

    Returns
    -------
    dict with keys 'country', 'latitude', 'longitude'; lat/lon are the raw
    degree/minute strings from the page's coordinate markup, or the string
    "NA" when the page carries no such markup.
    """
    # Lookup the correct URL suffix for this country and create the URL string
    theUrl = 'http://en.wikipedia.org/wiki/{}'.format(countryMap(country))
    # Fetch the webpage with the module-level PhantomJS driver
    driver.get(theUrl)
    #print driver.get_log('browser')
    # Place the webpage HTML into raw text
    pageString = driver.page_source
    # Parse the HTML text into a searchable BeautifulSoup object.
    # FIX: name the parser explicitly -- without it BeautifulSoup picks
    # whichever parser happens to be installed (and warns), making results
    # environment-dependent.
    soup = BeautifulSoup(pageString, 'html.parser')
    # Look for the particular DOM objects containing the lat/lon of the capital
    try:
        lat = soup.find('span', {'class': 'latitude'}).get_text()
        lon = soup.find('span', {'class': 'longitude'}).get_text()
        return dict(country=country, latitude=lat, longitude=lon)
    # FIX: catch only the expected failure -- find() returning None makes
    # .get_text() raise AttributeError -- instead of a bare except that would
    # also swallow KeyboardInterrupt and real driver errors.
    except AttributeError:
        return dict(country=country, latitude="NA", longitude="NA")
In [475]:
# Build up a dataframe of the capital coordinates
# by calling the previously defined function for
# every country in the yield dataset.
# NOTE(review): this cell is deliberately disabled (wrapped in a discarded
# string literal) so the slow scrape is not re-run; centroids are instead
# loaded from countryCoords.xlsx below. `unique` presumably comes from a
# %pylab star import, and DataFrame.append was removed in pandas 2.0 --
# both would need rework before re-enabling this code.
"""
countryList = unique(yieldData.Country)
capitalCoords = pd.DataFrame()
for country in countryList:
    # Skip aggregate features in the dataset
    if "Total" in country:
        continue
    capitalCoords = capitalCoords.append(getCapitalCoords(country), ignore_index=True)
    #print coords.shape[0]
""";
In [476]:
# Define a function to turn a lat/lon string in degrees/minutes/heading
# notation (e.g. u"45°30′N") into a signed decimal-degree float.
def coordStringToFloat(coordString):
    """Convert a degree(-minute) coordinate string to signed decimal degrees.

    The final character is the hemisphere: 'S' or 'W' negate the value.
    Returns np.nan for anything unparsable (e.g. the literal string "NA",
    or a NaN float coming out of pandas).
    """
    minutes = 0.0
    try:
        # FIX: the original wrapped this check in unicode(), which does not
        # exist on Python 3; the resulting NameError was silently swallowed
        # by a bare except, turning *every* coordinate into NaN there.
        if u'′' in coordString:
            # Degrees and minutes present, e.g. u"45°30′N"
            degreesMinutes, northSouthEastWest = coordString.split(u'′')
            degrees, minutes = degreesMinutes.split(u'°')
        else:
            # Degrees only, e.g. u"12°S"
            degrees, northSouthEastWest = coordString.split(u'°')
        coordFloat = int(degrees) + float(minutes) / 60.
        # Hemisphere is the last character; south and west are negative
        if coordString[-1] in ['S', 'W']:
            coordFloat = -coordFloat
        return coordFloat
    # FIX: removed the dead `degress = float()` typo line and narrowed the
    # bare except to the errors malformed/missing input can actually raise
    # (failed unpack/parse -> ValueError, non-string -> TypeError/AttributeError,
    # empty string indexing -> IndexError).
    except (ValueError, TypeError, AttributeError, IndexError):
        # This path is taken when coordString == "NA"
        return np.nan
# Apply the above function to the lat/lon fields obtained from Wikipedia
# and store the results into 'lat' and 'lon'
#capitalCoords['lat'] = capitalCoords.latitude.map(coordStringToFloat)
#capitalCoords['lon'] = capitalCoords.longitude.map(coordStringToFloat)
In [477]:
# Pull in country centroids from the COW dataset
# original source: http://www.opengeocode.org/download.php#cow
# N/S/E/W corrected source: http://www.zugzwang.org/geo/ogc/?limit=0
newCoords = pd.read_excel('countryCoords.xlsx')

# Convert the Latitude/Longitude degree-strings to signed decimal degrees
# and store the results into 'lat' and 'lon'
newCoords['lat'] = newCoords.Latitude.map(coordStringToFloat)
newCoords['lon'] = newCoords.Longitude.map(coordStringToFloat)

# Create an index on the country centroid dataframe with the country name
# so yield rows can be joined to centroids by name below
centerMap = newCoords.set_index(u'Name')
In [478]:
# Append the centroids to the existing yield data.
# NOTE(review): merge() defaults to an inner join, so any country whose name
# is absent from the centroid index is silently dropped here -- confirm the
# name mappings above cover everything of interest.
yieldData = yieldData.merge(centerMap[['lat', 'lon']], left_on='Country', right_index=True)
In [479]:
# Subset the yield data, keeping only the fields of interest
yieldDataLess = yieldData[['Country', 'Year', 'lat', 'lon', 'Value']]
# Look at the first five rows in the subset (rich display, last expression)
yieldDataLess.head()
Out[479]:
In [480]:
# Save off the pre-processed data subset for checkpointing.
# NOTE(review): "Geoocated" looks like a typo for "Geolocated"; left as-is
# because other code may already reference this exact filename.
yieldData[['Country', 'Year', 'lat', 'lon', 'Value']].to_csv('countryGeoocatedYields.csv', encoding='utf-8')
In [481]:
# Take the average of every reported crop yield value (Hg/Ha), collapsing
# the per-crop rows into one mean per (Country, Year, lat, lon) group.
# np.nanmean ignores missing values within each group.
yieldDataLess = yieldDataLess.groupby(['Country', 'Year', 'lat', 'lon']).agg(np.nanmean)
In [482]:
# Pivot the table, turning years into the columns;
# rows stay indexed by (Country, lat, lon)
yieldDataLess = yieldDataLess.unstack('Year')
In [483]:
# Replace all NAN values with zeros - note, this is a source of error in z-scoring
#yieldDataLess = yieldDataLess.fillna(value=0.0)
In [484]:
#yieldDataLess
In [487]:
# Create a dataframe with the same indexes (Country, lat, lon)
zScores = pd.DataFrame(index=yieldDataLess.index)

# Define a function for computing z-scores for each year in the dataset, ignoring nan values
def setZScore(index):
    """Fill the zScores row at `index` with per-year z-scores of that row's yields.

    Mutates the module-level zScores frame in place; invoked via index.map
    purely for this side effect. Missing yields stay NaN.
    NOTE(review): np.nanmean/np.nanstd over the whole column are recomputed
    for every row, making this O(rows^2 * years); hoisting the per-column
    statistics out would make it linear -- left unchanged here.
    """
    for col in yieldDataLess['Value'].columns:
        if np.isnan(yieldDataLess.loc[index]['Value'][col]):
            zScores.loc[index, col] = np.nan
        else:
            # Sample standard deviation (ddof=1) for the z-score denominator
            zScores.loc[index, col] = (yieldDataLess.loc[index]['Value'][col] - np.nanmean(yieldDataLess['Value'][col]))\
                / np.nanstd(yieldDataLess['Value'][col], ddof=1)

# Apply the z-scoring function,
# which populates the zScore dataframe with results
# (trailing ; suppresses the mapped index echo)
zScores.index.map(setZScore);
In [488]:
# Drop lat/lon from the index so zScores is keyed by Country alone
# (matches the shapefile lookups done in the plotting cells below)
zScores.index = zScores.index.droplevel(['lat', 'lon'])
#zScores
In [246]:
# Overall value range across all countries/years (for consistent scaling)
overallMin = yieldDataLess.Value.min()
overallMax = yieldDataLess.Value.max()

def doScatter(theData, theMap):
    """Build (but do not display) one year's yield scatter on a Basemap.

    theData : Series of yields indexed by (Country, lat, lon) for one year.
    theMap  : Basemap instance, used both as projection and plot target.
    Returns the PathCollection so it can be swapped in per animation frame.
    """
    lats = theData.index.get_level_values('lat').values
    lons = theData.index.get_level_values('lon').values
    # Project raw Lat/Lon values into map coordinates
    projX, projY = theMap(lons, lats)
    # Scatter with marker area scaled by log of the yield value
    theScatter = theMap.scatter(projX, projY, s=50*np.log(theData))
    # Remove it from the current plot; the animation re-adds it frame-by-frame
    theScatter.remove()
    return theScatter
from tempfile import NamedTemporaryFile

# HTML template for embedding a rendered animation as base64-encoded mp4
VIDEO_TAG = """<video controls>
<source src="data:video/x-m4v;base64,{0}" type="video/mp4">
Your browser does not support the video tag.
</video>"""

# NOTE(review): machine-specific absolute path -- parameterize before sharing
videoFile = "/Users/kyledunn/Dropbox/workspace/agriData/video.mp4"
def anim_to_html(anim):
    """Render a matplotlib animation to mp4 and return an HTML <video> snippet.

    The rendered file is base64-encoded and cached on the animation object
    (anim._encoded_video) so repeated display does not re-encode.
    """
    import base64  # local import: only needed here
    if not hasattr(anim, '_encoded_video'):
        #with theTempFile as f:
        anim.save(videoFile, writer=animation.FFMpegFileWriter(), fps=2, extra_args=['-vcodec', 'libx264'])
        # FIX: use a context manager so the file handle is closed, and use the
        # base64 module -- bytes.encode("base64") only existed on Python 2.
        with open(videoFile, "rb") as f:
            video = f.read()
        anim._encoded_video = base64.b64encode(video).decode('ascii')
    return VIDEO_TAG.format(anim._encoded_video)
from IPython.display import HTML

def display_animation(anim):
    """Close the animation's figure (prevents a duplicate static plot from
    being displayed) and return the embeddable HTML video widget."""
    plt.close(anim._fig)
    return HTML(anim_to_html(anim))
from matplotlib import animation
from mpl_toolkits.basemap import Basemap, cm

# create figure and axes instances
fig = plt.figure(figsize=(20,15))
ax = fig.add_axes([0.1,0.1,0.8,0.8])

# Kavrayskiy VII world projection centered on the prime meridian,
# crude ('c') coastline resolution
theMap = Basemap(projection='kav7',lon_0=0,resolution='c')

# Pre-render one scatter collection per year; each becomes one animation frame.
# NOTE(review): bare `unique` is presumably injected by a %pylab import -- confirm.
binList = list()
for year in unique(yieldDataLess.columns):
    datasubset = yieldDataLess[year]
    binList.append(doScatter(datasubset, theMap))

# Draw coastlines, state and country boundaries
theMap.drawcoastlines()
theMap.drawcountries()

# Draw the parallels
#parallels = np.arange(0.,90,10.)
#theMap.drawparallels(parallels,labels=[1,0,0,0],fontsize=10)
# Draw meridians
#meridians = np.arange(180.,360.,10.)
#theMap.drawmeridians(meridians,labels=[0,0,0,1],fontsize=10)

# TODO FIXME
# Add the colorbar and label
#colorBar = theMap.colorbar(location='bottom',pad="5%")
#colorBar.set_label('Agriculture Yield [Hg/Ha]')

# Add the title
plt.title("Worldwide Crop Yields over time [UN Reported]")

# This is called sequentially (frame count is i)
def animate(i):
    # Swap out the previous frame's scatter for this frame's.
    # NOTE(review): mutating ax.collections directly stopped working in
    # matplotlib 3.7+ (it became a read-only view); prefer artist.remove()
    # plus ax.add_collection() if upgrading.
    if i != 0:
        ax.collections.remove(binList[i-1])
    ax.collections.append(binList[i])

# Dummy init function prevents animate() from being called an extra time
# and throwing off the collection append/remove scheme
def init():
    return

# Call the animator; blit=False because whole frames are redrawn
anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(binList), blit=False)

# call our new function to display the animation
display_animation(anim)
Out[246]:
In [467]:
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import cartopy.io.shapereader as shpreader
import time
import numpy as np

# NOTE(review): VIDEO_TAG, videoFile, anim_to_html and display_animation are
# re-defined here, shadowing the Basemap-section versions above -- consider
# consolidating the video helpers into one place.
VIDEO_TAG = """<video controls>
<source src="data:video/x-m4v;base64,{0}" type="video/mp4">
Your browser does not support the video tag.
</video>"""

# NOTE(review): machine-specific absolute path -- parameterize before sharing
videoFile = "/Users/kyledunn/Dropbox/workspace/agriData/video2.mp4"
def anim_to_html(anim, doShow=True):
    """Render `anim` to mp4 on disk; return embeddable HTML.

    doShow=True  -> inline base64 <video> tag (large notebook output!).
    doShow=False -> a small HTML link to the saved file instead.
    """
    import base64  # local import: only needed here
    if not hasattr(anim, '_encoded_video'):
        #with theTempFile as f:
        anim.save(videoFile, writer="ffmpeg", extra_args=['-vcodec', 'libx264'])
    if doShow:
        # FIX: close the file handle via a context manager and use the base64
        # module (bytes.encode("base64") is Python 2 only). Also re-read and
        # re-encode only when no cached encoding exists yet -- the original
        # re-encoded on every call despite the cache attribute.
        if not hasattr(anim, '_encoded_video'):
            with open(videoFile, "rb") as f:
                anim._encoded_video = base64.b64encode(f.read()).decode('ascii')
        return VIDEO_TAG.format(anim._encoded_video)
    else:
        return "<p>Video saved to <a href='file://{vid}'>{vid}</a></p>".format(vid=videoFile)
from IPython.display import HTML

def display_animation(anim, doShow=True):
    """Close the animation's figure and return its HTML representation
    (inline base64 video when doShow, otherwise a link to the saved file)."""
    plt.close(anim._fig)
    return HTML(anim_to_html(anim, doShow))
from matplotlib import animation
# FIX: on a fresh kernel this cell raised NameError -- `itertools` and the
# top-level `matplotlib` package are used below but were never imported
# anywhere in this notebook (only matplotlib.pyplot as plt).
import itertools
import matplotlib.colors

# Some nice "earthy" colors: RGB triples scaled into the 0-1 range
earth_colors = np.array([(199, 233, 192),
                         (161, 217, 155),
                         (116, 196, 118),
                         (65, 171, 93),
                         (35, 139, 69),
                         ]) / 255.
# Cycle endlessly so any number of countries can be colored
earth_colors = itertools.cycle(earth_colors)

# Color map from the D3 parcoords z-score routine
myColorList = ["#DE5E60", "#4682b4", "#4682b4", "#98df8a"]
d3ZscoreCmap = matplotlib.colors.ListedColormap(myColorList, name='d3Zscore', N=18)

#fig = plt.figure(figsize=(16,9))
#ax = plt.axes(projection=ccrs.PlateCarree())

# Per-year registry of drawn artists, used by animate() to clear frames
artists = dict()
def plotFor(year):
    """Color every country polygon on the cartopy axes by its z-score for `year`.

    Reads the module-level fig/ax/zScores; every artist added is recorded
    under artists[year] so animate() can remove them before the next frame.
    Countries missing from zScores are drawn grey.
    """
    global fig
    global ax
    global artists
    global zScores

    ax.set_title("Worldwide Crop Yields {} [UN Reported]".format(year))

    # Keep track of everything we add for this year
    artists[year] = list()

    # Range used to shift/scale raw z-scores into the colormap's [0, 1] domain
    minScore = min(zScores[year])
    maxScore = max(zScores[year])
    countryScoreRange = maxScore - minScore

    for country in shpreader.Reader('ne_110m_admin_0_countries').records():
        #print country.attributes['name_long'], earth_colors.next()
        try:
            countryScore = zScores.loc[country.attributes['name_long'], year]
            # Shift scores non-negative, then scale into [0, 1] for the cmap
            artists[year].append(ax.add_geometries(country.geometry, ccrs.PlateCarree(),
                                                   facecolor=d3ZscoreCmap((countryScore + abs(minScore)) / float(countryScoreRange)),
                                                   label=country.attributes['name_long']))
        # NOTE(review): bare except -- presumably guarding the KeyError for
        # country names absent from zScores, but it also masks real plotting
        # errors; narrow to KeyError once confirmed.
        except:
            artists[year].append(ax.add_geometries(country.geometry, ccrs.PlateCarree(),
                                                   facecolor="grey",
                                                   label=country.attributes['name_long']))

    # Disabled alternative facecolor experiments, kept for reference
    """facecolor=earth_colors.next(),""";
    """facecolor=d3ZscoreCmap(abs(countryScore) / float(countryScoreRange))""";

    # Add colorbar, make sure to specify tick locations to match desired ticklabels
    #cbar = fig.colorbar(ax, ticks=[0, abs(minScore), countryScoreRange])
    #cbar.ax.set_yticklabels(['Below average', 'Average', 'Above Average'])# vertically oriented colorbar

    return ax
# This is called sequentially (frame count is i)
def animate(i):
    """Frame i: remove the previous year's country artists, draw this year's."""
    global artists
    # Frames are 0-based; frame 0 is the earliest year column in zScores
    thisYear = min(zScores.columns)+i
    if i != 0:
        for a in artists[thisYear-1]: a.remove()
    return plotFor(thisYear)

# Dummy init function prevents animate() from being called an extra time
# and throwing off the collection append/remove scheme
def init():
    return #[ax]

# Disabled: the cartopy animation was superseded by the Basemap version below.
#anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(zScores.columns),
#                               interval=350, blit=False, save_count=len(zScores.columns))

# call our new function to display the animation
#display_animation(anim)
#plotFor(2013)
In [489]:
from mpl_toolkits.basemap import Basemap
from matplotlib.colors import rgb2hex
from matplotlib.patches import Polygon
from scipy.stats import norm

# TODO future work with mpld3 interactive basemap
import mpld3

# Create a figure and axes to plot on.
# NOTE(review): bare figure()/contourf() rely on a %pylab-style star import.
f = figure(figsize=(16,9))
ax2 = plt.subplot(111)

# Use 'hot' colormap
chosenCmap = plt.cm.hot

# Create a basemap with a Kavrayskiy VII projection,
# centered at 0 degrees longitude, and no geographical resolution
# (we add our own from shapefile)
m = Basemap(projection='kav7', lon_0=0, resolution=None)

# Fake a contour in order to create a 'custom' colorbar
cont = contourf(np.array([[0, 0.5, 1], [0, 0, 0]]))
# Align the colorbar with our chosen cmap from above
cont.set_cmap(chosenCmap)

# Add colorbar, specify tick locations, set the labels for those locations and label the units
cbar = f.colorbar(cont, ticks=[0.5])
# Vertically oriented colorbar
cbar.ax.set_yticklabels(['Average'])
cbar.set_label('Relative Agriculture Yield to World Average')

# Load country info shapefile in current working directory
# and read it into the map instance (it handles coordinate projection automagically)
shapefile = 'ne_110m_admin_0_countries'
shp_info = m.readshapefile(shapefile, 'countries', drawbounds=True)

# Per-year registries: Polygon lists and the drawn PatchCollections
artists = dict()
collectionList = dict()
def myCdf(val, year):
    """Empirical CDF of the z-scores for `year`: the fraction of countries
    whose z-score is <= val. NaN scores never satisfy the comparison, so
    they contribute only to the denominator."""
    scores = zScores[year]
    numAtOrBelow = (scores <= val).sum()
    return numAtOrBelow / float(scores.shape[0])
def plotFor(year):
    """Draw one frame: color every country polygon by its yield z-score for
    `year` on the Basemap axes.

    Side effects: stores this year's Polygon list in collectionList[year]
    and the drawn PatchCollection in artists[year], so animate() can remove
    it before drawing the next frame.
    """
    global f
    global ax2
    global cbar
    global artists
    global collectionList
    global zScores

    ax2.set_title("Worldwide Crop Yields {} [UN FAO Reported]".format(year))

    # Compute a CDF for mapping values in (0,1) range
    # and use the resulting probabilities to create a
    # lookup table by z-score
    lookup = dict(zip(zScores[year], [myCdf(i, year) for i in zScores[year].values]))

    # Temporary list to plot whole collection at once
    collectionList[year] = list()

    # Iterate through the shapedict, get the current country name, its yield z-score, set its color
    for i, shapedict in enumerate(m.countries_info):
        # Look up the country name
        countryName = shapedict['name_long']
        try:
            # Choose a color for each country based on mean agricultural yield z-score
            score = zScores.loc[countryName, year]
            # Handle outliers consistently: clamp |z| > 2 to the colormap ends
            if score > 2:
                score = 1
            elif score < -2:
                score = 0
            else:
                # Map the score to a compressed
                # range in 0.25-0.75, respecting relative distance (as best as possible)
                score = lookup[score]/2 + 0.25
            # Calling colormap with value between 0 and 1 returns
            # rgba value. (Hot colors are above average yield)
            rgbColor = chosenCmap( score )[:3]
            hexColor = rgb2hex(rgbColor)
        # NOTE(review): bare except -- presumably a KeyError for countries
        # missing from zScores, but it will also hide genuine errors.
        except:
            # Default to black if we have no score
            hexColor = "black"
        # Look up this country's border vertices
        borders = m.countries[i]
        # Draw a polygon around the border, color it according to score
        poly = Polygon(borders, facecolor=hexColor, edgecolor=hexColor)
        # Append the polygon to the patch collection for this year
        collectionList[year].append(poly)

    # Create a patch collection and add it to the axes, respecting colors set per patch.
    # NOTE(review): relies on a top-level `matplotlib` name (e.g. from %pylab);
    # matplotlib.collections is not imported explicitly anywhere in this notebook.
    collection = matplotlib.collections.PatchCollection(collectionList[year], match_original=True)
    artists[year] = ax2.add_collection(collection)
# This is called sequentially (frame count is i)
def animate(i):
    """Frame i: remove the previous year's PatchCollection, draw this year's."""
    global artists
    # Frames are 0-based; frame 0 is the earliest year column in zScores
    thisYear = min(zScores.columns)+i
    if i != 0:
        artists[thisYear-1].remove()
    return plotFor(thisYear)
# Dummy init function prevents animate() from being called an extra time
# and throwing off the collection append/remove scheme
def init():
    return

# Call the animator; blit=False because each frame redraws whole collections
anim = animation.FuncAnimation(f, animate, init_func=init, frames=len(zScores.columns),
                               interval=350, blit=False, save_count=len(zScores.columns))

# Render the animation to disk and show only a link (doShow=False keeps the
# notebook output small)
display_animation(anim, doShow=False)
Out[489]:
In [490]:
from mpld3 import plugins, fig_to_html
#plugins.connect(f, plugins.LineLabelTooltip(lines[0]))
#plugins.PointLabelTooltip

# FIX: the original referenced two undefined names -- `patchList` (never
# defined anywhere) and `collection` (a local variable of plotFor() that does
# not leak into the module namespace) -- so this cell raised NameError.
# Recover the most recently drawn PatchCollection from the `artists` dict
# that plotFor() populates, and size the label list from its patch count.
lastCollection = artists[max(artists.keys())]
# One numeric label per country polygon in the collection
labels = [str(i) for i in range(1, len(lastCollection.get_paths()) + 1)]
tooltip = plugins.PointLabelTooltip(lastCollection, labels=labels)
plugins.connect(f, tooltip)
HTML(fig_to_html(f))
Out[490]:
In [ ]: