Get started by importing the Pandas library


In [470]:
import pandas as pd

Data source : http://faostat3.fao.org/download/Q/QC/E

Exporting this entire dataset as a CSV provides a file named:
Production_Crops_E_All_Data.csv

the entire dataset is ~250MB

Grep can be used on Mac/Linux to quickly filter only features of interest:
$ grep "Yield" Production_Crops_E_All_Data.csv > YieldData.csv

the filtered dataset ~65MB


In [471]:
# Use Pandas to read the CSV file and store it in a DataFrame object
yieldData = pd.read_csv('YieldData.csv')

def countryNameMap(name):
    if name == "Democratic People's Republic of Korea": return 'Dem. Rep. Korea'
    elif name == 'Republic of Moldova': return 'Moldova'
    elif name == 'China, mainland': return 'China'
    elif name == 'The former Yugoslav Republic of Macedonia': return 'Macedonia'
    elif name == 'China, Taiwan Province of': return 'Taiwan'
    elif name == 'United States of America': return 'United States'
    elif name == 'Iran (Islamic Republic of)': return 'Iran'
    elif name == 'Ethiopia PDR': return 'Ethiopia'
    elif name == 'Bolivia (Plurinational State of)': return 'Bolivia'
    elif name == 'China, Hong Kong SAR': return 'China'
    elif name == 'United States Virgin Islands': return 'United States'
    elif name == 'USSR': return 'Russian Federation'
    elif name == 'United Republic of Tanzania': return 'Tanzania'
    elif name == 'China, Macao SAR': return 'China'
    elif name == 'Venezuela (Bolivarian Republic of)': return 'Venezuela'
    elif name == 'Sudan (former)': return 'Sudan'
    elif name == 'Viet Nam': return 'Vietnam'
    elif name == 'Gambia': return 'The Gambia'
    elif name == 'Syrian Arab Republic': return 'Syria'
    elif name == "Lao People's Democratic Republic": return 'Lao PDR'
    elif name == 'Occupied Palestinian Territory': return 'Palestine'
    else: return name

# Convert the country names listed above to match with the shapefile for plotting
yieldData.Country = yieldData.Country.map(countryNameMap)

Selenium/Beautiful Soup Wikipedia Scraping


In [472]:
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Spoof the UserAgent string to appear as a typical browser
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = \
        ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0")

# Ignore SSL errors and set a logfile path for troubleshooting
driver = webdriver.PhantomJS(desired_capabilities=dcap, 
                             service_args=['--ignore-ssl-errors=true', '--ssl-protocol=tlsv1'],
                             service_log_path="/Users/kyledunn/phantom.log")

# Fake the window size to appear as a typical browser
driver.set_window_size(1440, 1024)

In [473]:
# Create a mapping for countries in the dataset that differ from the Wikipedia URL where it is found
def countryMap(name):
    if name == 'Belgium-Luxembourg':
        return "Luxembourg"
    elif name == 'China, Hong Kong SAR':
        return "Hong_Kong"
    elif name == 'China, Macao SAR':
        return "Macau"
    elif name == 'China, Taiwan Province of':
        return "Taiwan"
    elif name == 'China, mainland':
        return "China"
    elif name == 'Congo':
        return "Democratic_Republic_of_the_Congo"
    elif name == 'Czechoslovakia':
        return "Czech_Republic"
    elif name == 'Ethiopia PDR':
        return "Ethiopia"
    elif name == 'Georgia':
        return "Georgia_(country)"
    elif name == 'Sudan (former)':
        return "Sudan"
    elif name == 'USSR':
        return "Russia"
    elif name == 'Uganda':
        return "Kampala"
    elif name == 'Yugoslav SFR':
        return "Belgrade"
    else:
        return name

In [474]:
# Define a function to scrape wikipedia for coordinates of each country's capital
# the idea is to use this as an approximate country center

def getCapitalCoords(country):
    # Lookup the correct URL suffix for this country and create the URL string
    theUrl = 'http://en.wikipedia.org/wiki/{}'.format(countryMap(country))
    
    # Fetch the webpage
    driver.get(theUrl)
    
    #print driver.get_log('browser')

    # Place the webpage HTML into raw text
    pageString = driver.page_source

    # Parse the HTML text into a searchable BeautifulSoup object
    soup = BeautifulSoup(pageString)
    
    # Look for the particular DOM objects containing the lat/lon of the capital
    try:
        lat = soup.find('span', {'class': 'latitude'}).get_text()
        lon = soup.find('span', {'class': 'longitude'}).get_text()
        
        return dict(country=country, latitude=lat, longitude=lon)
    # If the above fails, return NA's for the lat/lon
    except:
        return dict(country=country, latitude="NA", longitude="NA")

In [475]:
# Build up a dataframe of the capital coordinates
# by calling the previously defined function for
# every country in the yield dataset
"""
countryList = unique(yieldData.Country)

capitalCoords = pd.DataFrame()
for country in countryList:
    # Skip aggregate features in the dataset
    if "Total" in country:
        continue
    capitalCoords = capitalCoords.append(getCapitalCoords(country), ignore_index=True)
    #print coords.shape[0]
""";

In [476]:
# Define a function to turn a lat/lon string 
# into degrees, minutes, heading into +/-decimal representation
def coordStringToFloat(coordString):
    degress = float()
    minutes = 0.0
    
    try:
        if u'′' in unicode(coordString):
            [degreesMinutes, northSouthEastWest] = coordString.split(u'′')
            [degrees, minutes] = degreesMinutes.split(u'°')
        else:
            [degrees, northSouthEastWest] = coordString.split(u'°')

        coordFloat = int(degrees) + float(minutes)/60.

        northSouthEastWest = coordString[-1]
        if northSouthEastWest in ['S','W']:
            coordFloat = -coordFloat

        return coordFloat
    except:
        # This gets taken when coordString == NA
        return np.nan

# Apply the above function to the lat/lon fields obtained from Wikipedia
# and store the results into 'lat', and 'lon'
#captialCoords['lat'] = capitalCoords.latitude.map(coordStringToFloat)
#captialCoords['lon'] = captialCoords.longitude.map(coordStringToFloat)

In [477]:
# Pull in country centroids from cow dataset
# original source: http://www.opengeocode.org/download.php#cow
# N/S/E/W corrected source: http://www.zugzwang.org/geo/ogc/?limit=0

newCoords = pd.read_excel('countryCoords.xlsx')

# Apply the above function to the lat/lon fields obtained from Wikipedia
# and store the results into 'lat', and 'lon'
newCoords['lat'] = newCoords.Latitude.map(coordStringToFloat)
newCoords['lon'] = newCoords.Longitude.map(coordStringToFloat)

# Create an index on the country centroid dataframe with the country name
centerMap = newCoords.set_index(u'Name')

In [478]:
# Append the centroids to the existing yield data
yieldData = yieldData.merge(centerMap[['lat', 'lon']], left_on='Country', right_index=True)

In [479]:
# Subset the yield data, keeping only the fields of interest
yieldDataLess = yieldData[['Country', 'Year', 'lat', 'lon', 'Value']]

# Look at the first five rows in the subset
yieldDataLess.head()


Out[479]:
Country Year lat lon Value
0 Afghanistan 1976 33 65 16610.2
1 Afghanistan 1977 33 65 15000.0
2 Afghanistan 1978 33 65 20000.0
3 Afghanistan 1979 33 65 17500.0
4 Afghanistan 1980 33 65 17069.0

In [480]:
# Save off the pre-processed data subset for checkpointing
yieldData[['Country', 'Year', 'lat', 'lon', 'Value']].to_csv('countryGeoocatedYields.csv', encoding='utf-8')

In [481]:
# Take the average of every reported crop yield value (Hg/Ha)
yieldDataLess = yieldDataLess.groupby(['Country', 'Year', 'lat', 'lon']).agg(np.nanmean)

In [482]:
# Pivot the table, turning years into the columns
yieldDataLess = yieldDataLess.unstack('Year')

In [483]:
# Replace all NAN values with zeros - note, this is a source of error in z-scoring
#yieldDataLess = yieldDataLess.fillna(value=0.0)

In [484]:
#yieldDataLess

In [487]:
# Create a dataframe with the same indexes
zScores = pd.DataFrame(index=yieldDataLess.index)

# Define a function for computing z-scores for each year in the dataset, ignoring nan values
def setZScore(index):
    for col in yieldDataLess['Value'].columns:
        if np.isnan(yieldDataLess.loc[index]['Value'][col]):
            zScores.loc[index, col] = np.nan
        else:
            zScores.loc[index, col] = (yieldDataLess.loc[index]['Value'][col] - np.nanmean(yieldDataLess['Value'][col]))\
                                                / np.nanstd(yieldDataLess['Value'][col], ddof=1)

# Apply the z-scoring function, 
# which populates the zScore dataframe with results
zScores.index.map(setZScore);

In [488]:
zScores.index = zScores.index.droplevel(['lat', 'lon'])
#zScores

Animated scatter plot using yield averages for each country


In [246]:
overallMin = yieldDataLess.Value.min()
overallMax = yieldDataLess.Value.max()

def doScatter(theData, theMap):
    lats = theData.index.get_level_values('lat').values
    lons = theData.index.get_level_values('lon').values
    
    # Project raw Lat/Lon values into map coordinates
    projX, projY = theMap(lons, lats)

    # Create a hexbin heatmap
    theScatter = theMap.scatter(projX, projY, s=50*np.log(theData))

    # Remove it from the current plot
    theScatter.remove()
    
    return theScatter
   

from tempfile import NamedTemporaryFile

VIDEO_TAG = """<video controls>
 <source src="data:video/x-m4v;base64,{0}" type="video/mp4">
 Your browser does not support the video tag.
</video>"""

videoFile = "/Users/kyledunn/Dropbox/workspace/agriData/video.mp4"
def anim_to_html(anim):
    if not hasattr(anim, '_encoded_video'):
        #with theTempFile as f:
        anim.save(videoFile, writer=animation.FFMpegFileWriter(), fps=2, extra_args=['-vcodec', 'libx264'])
        video = open(videoFile, "rb").read()
        anim._encoded_video = video.encode("base64")
    
    return VIDEO_TAG.format(anim._encoded_video)

from IPython.display import HTML

def display_animation(anim):
    plt.close(anim._fig)
    return HTML(anim_to_html(anim))

from matplotlib import animation

from mpl_toolkits.basemap import Basemap, cm

# create figure and axes instances
fig = plt.figure(figsize=(20,15))
ax = fig.add_axes([0.1,0.1,0.8,0.8])

theMap = Basemap(projection='kav7',lon_0=0,resolution='c')

binList = list()
for year in unique(yieldDataLess.columns):
    datasubset = yieldDataLess[year]
    
    binList.append(doScatter(datasubset, theMap))

# Draw coastlines, state and country boundaries
theMap.drawcoastlines()
theMap.drawcountries()

# Draw the parallels
#parallels = np.arange(0.,90,10.)
#theMap.drawparallels(parallels,labels=[1,0,0,0],fontsize=10)

# Draw meridians
#meridians = np.arange(180.,360.,10.)
#theMap.drawmeridians(meridians,labels=[0,0,0,1],fontsize=10)

# TODO FIXME
# Add the colorbar and label
#colorBar = theMap.colorbar(location='bottom',pad="5%")
#colorBar.set_label('Agriculture Yield [Hg/Ha]')

# Add the title
plt.title("Worldwide Crop Yields over time [UN Reported]")

# This is called sequentially (frame count is i)
def animate(i):
    if i != 0:
        ax.collections.remove(binList[i-1])
    
    ax.collections.append(binList[i])

# Dummy init function prevents animate() from being called an extra time
# and throwing off the collection append/remove scheme
def init():
    return
    
# call the animator.  blit=True means only re-draw the parts that have changed.
anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(binList), blit=False)

# call our new function to display the animation
display_animation(anim)


Out[246]:

Animated "heatmap" using z-score of average country yield per year - Cartopy library

Note: animations do not currently work in Chrome


In [467]:
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import cartopy.io.shapereader as shpreader
import time
import numpy as np

VIDEO_TAG = """<video controls>
 <source src="data:video/x-m4v;base64,{0}" type="video/mp4">
 Your browser does not support the video tag.
</video>"""

videoFile = "/Users/kyledunn/Dropbox/workspace/agriData/video2.mp4"
def anim_to_html(anim, doShow=True):
    if not hasattr(anim, '_encoded_video'):
        #with theTempFile as f:
        anim.save(videoFile, writer="ffmpeg", extra_args=['-vcodec', 'libx264'])
        if doShow:
            video = open(videoFile, "rb").read()
            anim._encoded_video = video.encode("base64")
    
            return VIDEO_TAG.format(anim._encoded_video)
        else:
            return "<p>Video saved to <a href='file://{vid}'>{vid}</a></p>".format(vid=videoFile)

from IPython.display import HTML

def display_animation(anim, doShow=True):
    plt.close(anim._fig)
    return HTML(anim_to_html(anim, doShow))

from matplotlib import animation

# some nice "earthy" colors
earth_colors = np.array([(199, 233, 192),
                                (161, 217, 155),
                                (116, 196, 118),
                                (65, 171, 93),
                                (35, 139, 69),
                                ]) / 255.
earth_colors = itertools.cycle(earth_colors)

# Color map from D3 parcoords z-score rountine
myColorList = ["#DE5E60", "#4682b4", "#4682b4", "#98df8a"]
d3ZscoreCmap = matplotlib.colors.ListedColormap(myColorList, name='d3Zscore', N=18)

#fig = plt.figure(figsize=(16,9))
#ax = plt.axes(projection=ccrs.PlateCarree())

artists = dict()
def plotFor(year):
    global fig
    global ax
    global artists
    global zScores
    
    ax.set_title("Worldwide Crop Yields {} [UN Reported]".format(year))
    
    # Keep track of everything we add for this year
    artists[year] = list()
    
    minScore = min(zScores[year])
    maxScore = max(zScores[year])
    countryScoreRange = maxScore - minScore
    
    for country in shpreader.Reader('ne_110m_admin_0_countries').records():
        #print country.attributes['name_long'], earth_colors.next()
        
        try:
            countryScore = zScores.loc[country.attributes['name_long'], year]
            artists[year].append(ax.add_geometries(country.geometry, ccrs.PlateCarree(),
                      facecolor=d3ZscoreCmap((countryScore + abs(minScore)) / float(countryScoreRange)),
                      label=country.attributes['name_long']))
        except:
            artists[year].append(ax.add_geometries(country.geometry, ccrs.PlateCarree(),
                  facecolor="grey",
                  label=country.attributes['name_long']))
        
        """facecolor=earth_colors.next(),""";
        """facecolor=d3ZscoreCmap(abs(countryScore) / float(countryScoreRange))""";
        
    # Add colorbar, make sure to specify tick locations to match desired ticklabels
    #cbar = fig.colorbar(ax, ticks=[0, abs(minScore), countryScoreRange])
    #cbar.ax.set_yticklabels(['Below average', 'Average', 'Above Average'])# vertically oriented colorbar
    
    return ax

# This is called sequentially (frame count is i)
def animate(i):
    global artists
    
    thisYear = min(zScores.columns)+i
    if i != 0:
        for a in artists[thisYear-1]: a.remove()
    
    return plotFor(thisYear)

# Dummy init function prevents animate() from being called an extra time
# and throwing off the collection append/remove scheme
def init():
    return #[ax]
    
# call the animator.  blit=True means only re-draw the parts that have changed.
#anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(zScores.columns), 
#                               interval=350, blit=False, save_count=len(zScores.columns))

# call our new function to display the animation
#display_animation(anim)

#plotFor(2013)

Animated "heatmap" using z-score of average country yield per year - Pure Matplotlib/Basemap


In [489]:
from mpl_toolkits.basemap import Basemap
from matplotlib.colors import rgb2hex
from matplotlib.patches import Polygon

from scipy.stats import norm

# TODO future work with mpld3 interactive basemap
import mpld3

# Create a figure and axes to plot on
f = figure(figsize=(16,9))
ax2 = plt.subplot(111)

# Use 'hot' colormap
chosenCmap = plt.cm.hot

# Create a basemap with a Kavrayskiy VII projection, 
# centered at 0` longitude, and no geographical resolution 
# (we add our own from shapefile)
m = Basemap(projection='kav7', lon_0=0, resolution=None)

# Fake a contour to in order to create a 'custom' colorbar
cont = contourf(np.array([[0, 0.5, 1], [0, 0, 0]]))

# Align the colorbar with our chosen cmap from above
cont.set_cmap(chosenCmap)

# Add colorbar, specify tick locations, set the labels for those locations and label the units
cbar = f.colorbar(cont, ticks=[0.5])

# Vertically oriented colorbar
cbar.ax.set_yticklabels(['Average'])
cbar.set_label('Relative Agriculture Yield to World Average')    
    
# Load country info shapefile in current working directory
# and read it into the map instance (it handles coordinate projection automagically)
shapefile = 'ne_110m_admin_0_countries'
shp_info = m.readshapefile(shapefile, 'countries', drawbounds=True)

artists = dict()
collectionList = dict()

def myCdf(val, year):
    return zScores[zScores[year] <= val][year].shape[0]/float(zScores[year].shape[0])

def plotFor(year):
    global f
    global ax2
    global cbar
    global artists
    global collectionList
    global zScores
    
    ax2.set_title("Worldwide Crop Yields {} [UN FAO Reported]".format(year))

    # Compute a CDF for mapping values in (0,1) range
    # and use the resulting probabilities to create a 
    # lookup table by z-score
    lookup = dict(zip(zScores[year], [myCdf(i, year) for i in zScores[year].values]))
    
    # Temporary list to plot whole collection at once
    collectionList[year] = list()

    # Iterate through the shapedict, get the current country name, its yield z-score, set its color
    for i, shapedict in enumerate(m.countries_info):

        # Look up the country name
        countryName = shapedict['name_long']

        try:
            # Choose a color for each country based on mean agricultural yield z-score
            score = zScores.loc[countryName, year]
            
            # Handle outliers consistently
            if score > 2:
                score = 1
            elif score < -2:
                score = 0
            else:
                # Map the score to a compressed
                # range in 0.25-0.75, respecting relative distance (as best as possible)
                score = lookup[score]/2 + 0.25

            # Calling colormap with value between 0 and 1 returns
            # rgba value.  (Hot colors are above average yield)        
            rgbColor = chosenCmap( score )[:3]
            
            hexColor = rgb2hex(rgbColor)
        except:
            # Default to black if we have no score
            hexColor = "black"

        # Look up this country's border vertices
        borders = m.countries[i]

        # Draw a polygon around the border, color it according to score
        poly = Polygon(borders, facecolor=hexColor, edgecolor=hexColor)

        # Append the polygon to the patch collection for this year
        collectionList[year].append(poly)

    # Create a patch collection and add it to the axes, respecting colors set per patch
    collection = matplotlib.collections.PatchCollection(collectionList[year], match_original=True)
    artists[year] = ax2.add_collection(collection)
    

# This is called sequentially (frame count is i)
def animate(i):
    global artists
    
    thisYear = min(zScores.columns)+i
    if i != 0:
        artists[thisYear-1].remove()
    
    return plotFor(thisYear)

# Dummy init function prevents animate() from being called an extra time
# and throwing off the collection append/remove scheme
def init():
    return
    
# call the animator.  blit=True means only re-draw the parts that have changed.
anim = animation.FuncAnimation(f, animate, init_func=init, frames=len(zScores.columns), 
                               interval=350, blit=False, save_count=len(zScores.columns))

# call our new function to display the animation
display_animation(anim, doShow=False)




Use mpld3 to make an interactive version with tooltips - Needs work


In [490]:
from mpld3 import plugins, fig_to_html
#plugins.connect(f, plugins.LineLabelTooltip(lines[0]))
#plugins.PointLabelTooltip
tooltip = plugins.PointLabelTooltip(collection, labels=[str(i) for i in range(1,len(patchList)+1)])
plugins.connect(f, tooltip)

HTML(fig_to_html(f))


Out[490]:

In [ ]: