In [45]:
# must go first
%matplotlib inline
%config InlineBackend.figure_format='retina'
# plotting
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_context("poster", font_scale=1.3)
import folium
# system packages
import os, sys
import warnings
warnings.filterwarnings('ignore')
# basic wrangling
import numpy as np
import pandas as pd
# eda tools
import pivottablejs
import missingno as msno
import pandas_profiling
# interactive
import ipywidgets as widgets
# more technical eda
import sklearn
import scipy
In [46]:
# Make the shared helper scripts importable from this notebook
sys.path += ['../../scripts/']
In [47]:
from aqua_helper import time_slice, country_slice, time_series, simple_regions, subregion, variable_slice
In [48]:
# Global matplotlib styling for every figure in the notebook.
# NOTE: 'axes.color_cycle' was deprecated in matplotlib 1.5 and removed in
# 2.0; 'axes.prop_cycle' with a cycler is the supported replacement.
mpl_update = {'font.size': 16,
              'xtick.labelsize': 14,
              'ytick.labelsize': 14,
              'figure.figsize': [12.0, 8.0],
              'axes.prop_cycle': mpl.cycler(color=['#0055A7', '#2C3E4F', '#26C5ED',
                                                   '#00cc66', '#D34100', '#FF9700',
                                                   '#091D32']),
              'axes.labelsize': 16,
              'axes.labelcolor': '#677385',
              'axes.titlesize': 20,
              'lines.color': '#0055A7',
              'lines.linewidth': 3,
              'text.color': '#677385'}
mpl.rcParams.update(mpl_update)
Throughout the entire analysis, keep the following goals in mind:
In [49]:
# Load the AQUASTAT dataset (gzip-compressed CSV)
data = pd.read_csv('../../data/aquastat/aquastat.csv.gzip', compression='gzip')

# Collapse the detailed regions into the simpler groupings defined in aqua_helper
data.region = data.region.map(lambda region_name: simple_regions[region_name])

# Drop the 'exploitable' variables and the national rainfall index
not_exploitable = ~data.variable.str.contains('exploitable')
data = data.loc[not_exploitable, :]
data = data.loc[~(data.variable == 'national_rainfall_index')]
Overview of variables
In [50]:
# One row per variable: short name next to its full description
data.loc[:, ['variable', 'variable_full']].drop_duplicates()
Out[50]:
Subset for cross-sectional analysis
In [51]:
# Cross-sectional slice: one row per country for the most recent time period
recent = time_slice(data, '2013-2017')
In [52]:
# Summary statistics for the population columns (integers are readable enough here)
recent.loc[:, ['total_pop', 'urban_pop', 'rural_pop']].describe().astype(int)
Out[52]:
Rural population is negative... what does that mean?
In [53]:
# Countries with the smallest (possibly negative) rural populations
population_cols = ['total_pop', 'urban_pop', 'rural_pop']
recent.sort_values('rural_pop')[population_cols].head()
Out[53]:
Visit glossary for dataset: http://www.fao.org/nr/water/aquastat/data/glossary/search.html?termId=4105&submitBtn=s&cls=yes
Rural population = Total population - urban population
In [54]:
# Qatar side by side: urban population exceeds the reported total,
# so rural = total - urban goes negative
qatar_total = time_series(data, 'Qatar', 'total_pop')
qatar_urban = time_series(data, 'Qatar', 'urban_pop')
qatar_rural = time_series(data, 'Qatar', 'rural_pop')
qatar_total.join(qatar_urban).join(qatar_rural)
Out[54]:
What to do about non-physical numbers? Remove? Replace with estimates from other sources like the world bank? --> Need to have a clear thought out reason for what you decide to do
We should also start suspecting that the data is skewed:
In [55]:
# Re-display the summary statistics: mean far above median hints at right skew
pop_cols = ['total_pop', 'urban_pop', 'rural_pop']
recent[pop_cols].describe().astype(int)
Out[55]:
Yes, it looks like population is skewed. Let's try calculating skewness and kurtosis, and plot a histogram to visualize.
Skewness: measure of lack of symmetry.
Kurtosis: measure of whether the data are heavily tailed relative to the normal distribution.
In [56]:
# Skewness per population column (a normal distribution scores 0)
pop_columns = ['total_pop', 'urban_pop', 'rural_pop']
recent[pop_columns].apply(scipy.stats.skew)
Out[56]:
Skewness for normal distribution should be zero. Negative skewness indicates skew left and positive skewness indicates skew right.
In [57]:
# Excess kurtosis per population column (a normal distribution scores 0)
pop_columns = ['total_pop', 'urban_pop', 'rural_pop']
recent[pop_columns].apply(scipy.stats.kurtosis)
Out[57]:
Kurtosis is also zero for a normal distribution, and large positive values indicate heavy tails. We definitely have some outliers!
In [58]:
# Raw histogram of total population: the skew is obvious
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(recent['total_pop'].values, bins=50)
ax.set(xlabel='Total population',
       ylabel='Number of countries',
       title='Distribution of population of countries 2013-2017');
Let's functionalize this so we can use it again.
Yup, definitely skewed. Why is this a problem?
Let's look at a map:
In [59]:
# Extending the code we used for plotting nulls geospatially
def plot_map(df, variable, time_period=None, log=False,
             legend_name=None, threshold_scale=None):
    """Draw a world choropleth map of `variable`.

    Parameters
    ----------
    df : DataFrame in the long AQUASTAT format (or an already-sliced frame).
    variable : str, column to color countries by.
    time_period : str or None; if given, `df` is first sliced to this period.
    log : bool; if True, np.log of the variable is plotted. Non-positive
        values become -inf/NaN and are simply left uncolored by folium.
    legend_name : legend caption; defaults to '<variable> for <time_period>'.
    threshold_scale : explicit color-bin edges passed through to folium.

    Returns
    -------
    folium.Map
    """
    geo = r'../../data/aquastat/world.json'
    legend_name = legend_name if legend_name else '%s for %s' % (variable, time_period)
    if time_period:
        df = time_slice(df, time_period).reset_index()
    else:
        df = df.reset_index()
    if log:
        # reset_index() above returned a new frame, so this does not
        # mutate the caller's DataFrame
        df[variable] = df[variable].apply(np.log)
    # named `fmap` rather than `map` to avoid shadowing the builtin
    fmap = folium.Map(location=[34, -45], zoom_start=2,
                      width=1200, height=600)
    fmap.choropleth(geo_path=geo,
                    data=df,
                    columns=['country', variable],
                    key_on='feature.properties.name', reset=True,
                    fill_color='PuBuGn', fill_opacity=0.7, line_opacity=0.2,
                    legend_name=legend_name,
                    threshold_scale=threshold_scale)
    return fmap
In [60]:
# Absolute population: a handful of huge countries dominate the color scale
plot_map(data, 'total_pop', '2013-2017', legend_name='Total population')
Out[60]:
What is the solution? Often, taking a log transform will make a variable more normal.
Does taking the log reduce skewness?
In [61]:
# Skewness of the log-transformed population: reduced, but not gone
log_total_pop = recent[['total_pop']].apply(np.log)
log_total_pop.apply(scipy.stats.skew)
Out[61]:
It does reduce skewness but does not make it disappear. How about kurtosis?
In [62]:
# Kurtosis of the log-transformed population: also reduced, but not gone
log_total_pop = recent[['total_pop']].apply(np.log)
log_total_pop.apply(scipy.stats.kurtosis)
Out[62]:
Again, it reduces but does not get rid of kurtosis.
Let's look visually at a histogram of the transformed distribution.
First, let's functionalize the histogram code from earlier and add an option to plot on a log scale.
In [63]:
def plot_hist(df, variable, bins=20, xlabel=None, by=None,
              ylabel=None, title=None, logx=False, ax=None):
    """Plot a histogram of df[variable], optionally with log-spaced bins.

    Parameters
    ----------
    df : DataFrame containing `variable`.
    variable : str, column to plot.
    bins : int, number of bins (log-spaced when logx=True).
    xlabel, ylabel, title : optional axis labels / title.
    by : unused; kept for backward compatibility with existing callers.
    logx : bool; if True, use a log x-axis. Non-positive data is shifted
        up so the minimum becomes 1, with a printed warning.
    ax : existing Axes to draw on; a new figure is created if None.

    Returns
    -------
    matplotlib Axes with the histogram drawn.
    """
    if not ax:
        fig, ax = plt.subplots(figsize=(12, 8))
    # Work on a local Series so the caller's DataFrame is never mutated
    # (the original version wrote the shifted values back into `df`,
    # silently corrupting data used by later cells).
    series = df[variable]
    if logx:
        if series.min() <= 0:
            # Compute the shift BEFORE applying it, so the warning reports
            # the actual amount (previously it always printed 0).
            shift = -series.min() + 1
            series = series + shift
            print('Warning: data <=0 exists, data transformed by %0.2g before plotting' % shift)
        bins = np.logspace(np.log10(series.min()),
                           np.log10(series.max()), bins)
        ax.set_xscale("log")
    ax.hist(series.dropna().values, bins=bins)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax
In [64]:
# Same histogram as before, now via the reusable helper with log-spaced bins
plot_hist(recent, 'total_pop', bins=25, logx=True,
          xlabel='Log of total population', ylabel='Number of countries',
          title='Distribution of total population of countries 2013-2017');
In [65]:
# The log scale reveals variation hidden by the few very large countries
plot_map(data, 'total_pop', '2013-2017', legend_name='Log of total population', log=True)
Out[65]:
Now we can see more variation across countries.
We can see larger countries have larger populations - that makes sense... we may hypothesize that water availability may affect countries with higher population density rather than higher absolute populations.
Normalization is a critical tool in the data scientist's toolbox and is often used for engineering features.
In [66]:
# Normalize population by land area: people per unit area
recent['population_density'] = recent['total_pop'] / recent['total_area']
In [67]:
# Explicit threshold_scale because density itself is heavily skewed
plot_map(recent, 'population_density', legend_name='Population density',
         threshold_scale=[0, 0.3, 0.8, 1.8, 78])
Out[67]:
Check a sample we have familiarity with.
In [68]:
# Sanity check against a familiar series: US population should grow smoothly
us_population = time_series(data, 'United States of America', 'total_pop')
plt.plot(us_population)
plt.xlabel('Year')
plt.ylabel('Population')
plt.title('United States population over time');
In [69]:
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    # Countries ordered by their population at the start of the record
    north_america = (time_slice(subregion(data, 'North America'), '1958-1962')
                     .sort_values('total_pop')
                     .index
                     .tolist())
    for country in north_america:
        plt.plot(time_series(data, country, 'total_pop'), label=country)

    plt.xlabel('Year')
    plt.ylabel('Population')
    plt.title('North American populations over time')
    plt.legend(loc=2, prop={'size': 10});
This doesn't really tell us much except that the United States is by far the biggest country in North America. We'd like to understand how each country's population changes over time, mostly in reference to itself. Let's normalize again.
What should we normalize by? We could choose a country's minimum, mean, median, maximum... or any other reference point.
Let's choose minimum so we can see how much each country grows in reference to its starting population.
In [70]:
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    for country in north_america:
        country_pop = time_series(data, country, 'total_pop')
        # Index each country to its own minimum (starting) population
        country_pop['norm_pop'] = country_pop.total_pop / country_pop.total_pop.min() * 100
        plt.plot(country_pop['norm_pop'], label=country)

    plt.xlabel('Year')
    plt.ylabel('Percent increase in population')
    plt.title('Percent increase in population from 1960 in North American countries')
    plt.legend(loc=2, prop={'size': 10});
There are too many lines here! We should really only have 3-4 lines on a plot to be able to spot patterns. What we can see, however, is that generally, larger countries grow faster than smaller ones, with two main exceptions. But we can't figure out which countries those are. Another option is a heatmap.
In [71]:
# Wide frame: one row per country, one column per time period
north_america_pop = variable_slice(subregion(data, 'North America'), 'total_pop')
# Divide each row by its own minimum, expressed as a percentage
row_baseline = north_america_pop.min(axis=1)
north_america_norm_pop = north_america_pop.div(row_baseline, axis=0) * 100
# Reorder rows to match the population ordering computed earlier
north_america_norm_pop = north_america_norm_pop.loc[north_america]
In [72]:
# Heatmap makes per-country growth visible where the line plot was too busy
fig, ax = plt.subplots(figsize=(16, 12))
heat_cmap = sns.light_palette((214, 90, 60), input="husl", as_cmap=True)
sns.heatmap(north_america_norm_pop, ax=ax, cmap=heat_cmap)
plt.xticks(rotation=45)
plt.xlabel('Time period')
plt.ylabel('Country, ordered by population in 1960 (<- greatest to least ->)')
plt.title('Percent increase in population from 1960');
ipywidgets allow you to change the variables passed to a function through some kind of widget, such as a selection slider (below), a drop-down menu, or more.
To use ipywidgets you must first enable the notebook extension:
jupyter nbextension enable --py --sys-prefix widgetsnbextension
widgets.interact then connects a widget to one of the function's arguments. We perform all these steps within a function so we can reuse it for different variables:
In [73]:
def map_over_time(df, variable, time_periods, log=False,
                  threshold_scale=None, legend_name=None):
    """Interactive choropleth: a slider selects which time period to map."""
    time_slider = widgets.SelectionSlider(options=time_periods.tolist(),
                                          value=time_periods[0],
                                          description='Time period:',
                                          disabled=False,
                                          button_style='')
    # Everything except the time period is frozen; only the slider varies
    fixed = widgets.fixed
    widgets.interact(plot_map, df=fixed(df),
                     variable=fixed(variable),
                     time_period=time_slider, log=fixed(log),
                     legend_name=fixed(legend_name),
                     threshold_scale=fixed(threshold_scale));
In [74]:
# All distinct time periods, used as slider options below
time_periods = data['time_period'].unique()
In [75]:
# Interactive map of log population across all time periods
map_over_time(data, 'total_pop', time_periods, log=True)
Please note, geospatial data analysis is a whole field of its own and we don't begin to do it justice here. There are many technical assumptions for geospatial modeling that we have not looked at as well as number of numerical methods for exploring it. The PySal package is a good package to start with in the field and tutorial can be found here for introductory geospatial data analysis.
In [76]:
# Linear-scale histogram of renewable water resources: heavily skewed
plot_hist(recent, 'total_renewable', bins=50,
          xlabel='Total renewable water resources ($10^9 m^3/yr$)',
          ylabel='Number of countries',
          title='Distribution of total renewable water resources, 2013-2017');
In [77]:
# Same variable on a log x-axis for a more readable distribution
plot_hist(recent, 'total_renewable', bins=50,
          xlabel='Total renewable water resources ($10^9 m^3/yr$)',
          ylabel='Number of countries', logx=True,
          title='Distribution of total renewable water resources, 2013-2017');
In [78]:
# Wide frame of renewable water resources for North America over time
north_america_subset = subregion(data, 'North America')
north_america_renew = variable_slice(north_america_subset, 'total_renewable')
In [79]:
fig, ax = plt.subplots(figsize=(16, 12))
# Same heatmap styling as the population plot, but with labels that
# describe the variable actually plotted (the previous ylabel/title were
# copy-pasted from the population heatmap and were wrong for this figure).
sns.heatmap(north_america_renew, ax=ax,
            cmap=sns.light_palette((214, 90, 60), input="husl", as_cmap=True))
plt.xticks(rotation=45)
plt.xlabel('Time period')
plt.ylabel('Country')
plt.title('Total renewable water resources over time ($10^9 m^3/yr$)');
Total renewable resources doesn't seem to change over time... let's just check:
In [80]:
# Spot-check: the values look constant across time periods
north_america_renew.head()
Out[80]:
Just to be sure, let's subtract 1958-1962 values from each period and add up the results:
In [81]:
# Subtract the first period from every period; zeros mean no change at all
first_period = north_america_renew.iloc[:, 0]
north_america_renew.sub(first_period, axis=0).sum()
Out[81]:
Does this apply to the rest of the world?
In [82]:
# Same wide slice, now for every country in the dataset
renew = variable_slice(data, 'total_renewable')
In [83]:
# Total change from the first period, summed per time period, worldwide
first_period = renew.iloc[:, 0]
renew.sub(first_period, axis=0).sum()
Out[83]:
Uh oh, looks like not. Let's look at it by country instead:
In [84]:
# Per-country total change from the first period, smallest first
first_period = renew.iloc[:, 0]
total_change = renew.sub(first_period, axis=0).sum(axis=1)
total_change.sort_values().head()
Out[84]:
Bhutan changed! But nowhere else. Let's also check the other end of the sorted list:
In [85]:
# The other end of the sorted list: look for any remaining non-zero changes
first_period = renew.iloc[:, 0]
total_change = renew.sub(first_period, axis=0).sum(axis=1)
total_change.sort_values().tail(50)
Out[85]:
Oh, we've got NaNs... probably because these countries didn't always exist.
In [86]:
def two_hist(df, variable, bins=50,
             ylabel='Number of countries', title=None):
    """Plot a variable's histogram on linear and log scales, side by side."""
    fig, axes = plt.subplots(1, 2, figsize=(18, 8))
    # Left panel: linear scale; right panel: log scale
    for axis, use_log, prefix in zip(axes, (False, True), ('', 'Log of ')):
        plot_hist(df, variable, bins=bins,
                  xlabel=prefix + variable, ylabel=ylabel,
                  logx=use_log, ax=axis,
                  title=(prefix + variable) if not title else title)
    # Close the figure so interact() does not display it twice; returning
    # it lets the caller (or Jupyter) render it exactly once.
    plt.close()
    return fig
In [87]:
def hist_over_var(df, variables, bins=50,
                  ylabel='Number of countries', title=None):
    """Interactive histograms: a dropdown selects which variable to plot."""
    variable_dropdown = widgets.Dropdown(options=variables.tolist(),
                                         value=variables[0],
                                         description='Variable:',
                                         disabled=False,
                                         button_style='')
    # Only the variable choice varies; all other two_hist arguments are frozen
    widgets.interact(two_hist, df=widgets.fixed(df),
                     variable=variable_dropdown,
                     ylabel=widgets.fixed(ylabel),
                     title=widgets.fixed(title),
                     bins=widgets.fixed(bins));
In [88]:
# Browse the distribution of every column interactively
# NOTE(review): recent.columns includes non-numeric columns; selecting one
# of those in the dropdown will presumably error — confirm before sharing.
hist_over_var(recent, recent.columns, bins=20)