In [45]:
# must go first 
%matplotlib inline 
%config InlineBackend.figure_format='retina'

# plotting
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_context("poster", font_scale=1.3)
import folium

# system packages 
import os, sys
import warnings
warnings.filterwarnings('ignore')

# basic wrangling 
import numpy as np
import pandas as pd

# eda tools 
import pivottablejs
import missingno as msno
import pandas_profiling

# interactive 
import ipywidgets as widgets

# more technical eda
import sklearn
import scipy

In [46]:
sys.path.append('../../scripts/')

In [47]:
from aqua_helper import time_slice, country_slice, time_series, simple_regions, subregion, variable_slice

In [48]:
mpl_update = {'font.size':16,
              'xtick.labelsize':14,
              'ytick.labelsize':14,
              'figure.figsize':[12.0,8.0],
              'axes.color_cycle':['#0055A7', '#2C3E4F', '#26C5ED', '#00cc66', '#D34100', '#FF9700','#091D32'], 
              'axes.labelsize':16,
              'axes.labelcolor':'#677385',
              'axes.titlesize':20,
              'lines.color':'#0055A7',
              'lines.linewidth':3,
              'text.color':'#677385'}
mpl.rcParams.update(mpl_update)

Our plan

Exploratory data analysis consists of the following major tasks, which we present linearly here because each task doesn't make much sense to do without the ones prior to it. However, in reality, you are going to constantly jump around from step to step. You may want to do all the steps for a subset of the variables first. Or often, an observation will bring up a question you want to investigate and you'll branch off and explore to answer that question before returning down the main path of exhaustive EDA.

  1. Form hypotheses/develop investigation themes to explore
  2. Wrangle data
  3. Assess quality of data
  4. Profile data
  5. Explore each individual variable in the dataset
  6. Assess the relationship between each variable and the target
  7. Assess interactions between variables
  8. Explore data across many dimensions

Throughout the entire analysis you want to:

  • Capture a list of hypotheses and questions that come up for further exploration.
  • Record things to watch out for/ be aware of in future analyses.
  • Show intermediate results to colleagues to get a fresh perspective, feedback, domain knowledge. Don't do EDA in a bubble! Get feedback throughout especially from people removed from the problem and/or with relevant domain knowledge.
  • Position visuals and results together. EDA relies on your natural pattern recognition abilities so maximize what you'll find by putting visualizations and results in close proximity.

Data wrangling


In [49]:
data = pd.read_csv('../../data/aquastat/aquastat.csv.gzip', compression='gzip')

# simplify regions
data.region = data.region.apply(lambda x: simple_regions[x])

# remove exploitable fields and national rainfall index
data = data.loc[~data.variable.str.contains('exploitable'),:]
data = data.loc[~(data.variable=='national_rainfall_index')]

Overview of variables


In [50]:
data[['variable','variable_full']].drop_duplicates()


Out[50]:
variable variable_full
0 total_area Total area of the country (1000 ha)
576 arable_land Arable land area (1000 ha)
1152 permanent_crop_area Permanent crops area (1000 ha)
1728 cultivated_area Cultivated area (arable land + permanent crops...
2304 percent_cultivated % of total country area cultivated (%)
2880 total_pop Total population (1000 inhab)
3456 rural_pop Rural population (1000 inhab)
4032 urban_pop Urban population (1000 inhab)
4608 gdp Gross Domestic Product (GDP) (current US$)
5184 gdp_per_capita GDP per capita (current US$/inhab)
5760 agg_to_gdp Agriculture, value added to GDP (%)
6336 human_dev_index Human Development Index (HDI) [highest = 1] (-)
6912 gender_inequal_index Gender Inequality Index (GII) [equality = 0; i...
7488 percent_undernourished Prevalence of undernourishment (3-year average...
8064 number_undernourished Number of people undernourished (3-year averag...
8640 avg_annual_rain_depth Long-term average annual precipitation in dept...
9216 avg_annual_rain_vol Long-term average annual precipitation in volu...
10368 surface_water_produced Surface water produced internally (10^9 m3/year)
10944 groundwater_produced Groundwater produced internally (10^9 m3/year)
11520 surface_groundwater_overlap Overlap between surface water and groundwater ...
12096 irwr Total internal renewable water resources (IRWR...
12672 irwr_per_capita Total internal renewable water resources per c...
13248 surface_entering Surface water: entering the country (total) (1...
13824 surface_inflow_submit_no_treaty Surface water: inflow not submitted to treatie...
14400 surface_inflow_submit_treaty Surface water: inflow submitted to treaties (1...
14976 surface_inflow_secure_treaty Surface water: inflow secured through treaties...
15552 total_flow_border_rivers Surface water: total flow of border rivers (10...
16128 accounted_flow_border_rivers Surface water: accounted flow of border rivers...
16704 accounted_flow Surface water: accounted inflow (10^9 m3/year)
17280 surface_to_other_countries Surface water: leaving the country to other co...
17856 surface_outflow_submit_no_treaty Surface water: outflow to other countries not ...
18432 surface_outflow_submit_treaty Surface water: outflow to other countries subm...
19008 surface_outflow_secure_treaty Surface water: outflow to other countries secu...
19584 surface_total_external_renewable Surface water: total external renewable (10^9 ...
20160 groundwater_entering Groundwater: entering the country (total) (10^...
20736 groundwater_accounted_inflow Groundwater: accounted inflow (10^9 m3/year)
21312 groundwater_to_other_countries Groundwater: leaving the country to other coun...
21888 groundwater_accounted_outflow Groundwater: accounted outflow to other countr...
22464 water_total_external_renewable Water resources: total external renewable (10^...
23040 total_renewable_surface Total renewable surface water (10^9 m3/year)
23616 total_renewable_groundwater Total renewable groundwater (10^9 m3/year)
24192 overlap_surface_groundwater Overlap: between surface water and groundwater...
24768 total_renewable Total renewable water resources (10^9 m3/year)
25344 dependency_ratio Dependency ratio (%)
25920 total_renewable_per_capita Total renewable water resources per capita (m3...
29376 interannual_variability Interannual variability (WRI) (-)
29952 seasonal_variability Seasonal variability (WRI) (-)
30528 total_dam_capacity Total dam capacity (km3)
31104 dam_capacity_per_capita Dam capacity per capita (m3/inhab)
31680 irrigation_potential Irrigation potential (1000 ha)
32256 flood_occurence Flood occurrence (WRI) (-)
32832 total_pop_access_drinking Total population with access to safe drinking-...
33408 rural_pop_access_drinking Rural population with access to safe drinking-...
33984 urban_pop_access_drinking Urban population with access to safe drinking-...

Subset for cross-sectional analysis


In [51]:
recent = time_slice(data, '2013-2017')

Exploring population

Cross-section

For numerical data, look at:

  • Location: mean, median, mode, interquartile mean
  • Spread: standard deviation, variance, range, interquartile range
  • Shape: skewness, kurtosis

Location and spread of the data

Are minimum/maximum values feasible?


In [52]:
recent[['total_pop', 'urban_pop', 'rural_pop']].describe().astype(int)


Out[52]:
2013-2017 total_pop urban_pop rural_pop
count 199 199 199
mean 36890 19849 17040
std 140720 69681 77461
min 0 0 -98
25% 1368 822 500
50% 7595 3967 2404
75% 25088 11656 10677
max 1407306 805387 891112

Rural population is negative... what does that mean?


In [53]:
recent.sort_values('rural_pop')[['total_pop','urban_pop','rural_pop']].head()


Out[53]:
2013-2017 total_pop urban_pop rural_pop
country
Qatar 2235.00 2333.00 -98.00
Singapore 5604.00 5619.00 -15.00
Monaco 37.73 38.32 -0.59
Holy See 0.80 0.80 0.00
Nauru 10.22 10.12 0.10

Rural population = Total population - urban population


In [54]:
time_series(data, 'Qatar', 'total_pop').join(time_series(data, 'Qatar', 'urban_pop')).join(time_series(data, 'Qatar', 'rural_pop'))


Out[54]:
total_pop urban_pop rural_pop
year_measured
1962 56.19 48.39 7.80
1967 86.16 75.48 10.68
1972 130.40 115.60 14.80
1977 182.40 162.40 20.00
1982 277.20 248.60 28.60
1987 423.30 385.40 37.90
1992 489.70 459.10 30.60
1997 528.20 506.50 21.70
2002 634.40 608.90 25.50
2007 1179.00 1130.00 49.00
2012 2016.00 2029.00 -13.00
2015 2235.00 2333.00 -98.00

What to do about non-physical numbers? Remove? Replace with estimates from other sources like the world bank? --> Need to have a clear thought out reason for what you decide to do

We should also start suspecting that the data is skewed:

  • 50% quartile much closer to 25% or 75% in value
  • Large difference between mean and 50%

Shape of the data

  • Is the distribution skewed? Bimodal?
  • Are there outliers? Are they feasible?
  • Are there discontinuities?

In [55]:
recent[['total_pop', 'urban_pop', 'rural_pop']].describe().astype(int)


Out[55]:
2013-2017 total_pop urban_pop rural_pop
count 199 199 199
mean 36890 19849 17040
std 140720 69681 77461
min 0 0 -98
25% 1368 822 500
50% 7595 3967 2404
75% 25088 11656 10677
max 1407306 805387 891112

Yes, it looks like population is skewed. Let's try calculting skewness and kurtosis and plot a histogram to visualize.

Skewness and kurtosis

Skewness: measure of lack of symmetry.

Kurtosis: measure of whether the data are heavily tailed relative to the normal distribution.


In [56]:
recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.skew)


Out[56]:
2013-2017
total_pop    8.519379
urban_pop    8.545690
rural_pop    9.490029
dtype: float64

Skewness for normal distribution should be zero. Negative skewness indicates skew left and positive skewness indicates skew right.


In [57]:
recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.kurtosis)


Out[57]:
2013-2017
total_pop    76.923725
urban_pop    85.499659
rural_pop    95.838930
dtype: float64

Kurtosis is also zero for a normal distribution and can only be postiive. We definitely have some outliers!

The trusty histogram


In [58]:
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(recent.total_pop.values, bins=50);
ax.set_xlabel('Total population');
ax.set_ylabel('Number of countries');
ax.set_title('Distribution of population of countries 2013-2017');


Let's functionalize this so we can use it again.

Yup, definitely skewed. Why is this a problem?

  • Many models assume a normal (bell-like) curve
  • It is difficult to differentiate observations when skewed (~150/199 look all the same!)

Let's look at a map:


In [59]:
# Extending the code we used for plotting nulls geospatially
def plot_map(df, variable, time_period=None, log=False, 
             legend_name=None, threshold_scale=None):
    geo = r'../../data/aquastat/world.json'
    
    legend_name = legend_name if legend_name else '%s for %s' % (variable, time_period)
    if time_period:
        df = time_slice(df, time_period).reset_index()
    else: 
        df = df.reset_index()
    
    if log:
        df[variable] = df[variable].apply(np.log)
        
    map = folium.Map(location=[34,-45], zoom_start=2,
                     width=1200, height=600)
    map.choropleth(geo_path=geo, 
                   data=df,
                   columns=['country', variable],
                   key_on='feature.properties.name', reset=True,
                   fill_color='PuBuGn', fill_opacity=0.7, line_opacity=0.2,
                   legend_name=legend_name,
                   threshold_scale=threshold_scale)
    return map

In [60]:
plot_map(data, 'total_pop', '2013-2017', legend_name='Total population')


Out[60]:

What is the solution? Often, taking a log transform will make a variable more normal.

Log transform

Does taking the log reduce skewness?


In [61]:
recent[['total_pop']].apply(np.log).apply(scipy.stats.skew)
# recent[['total_pop']].apply(np.log).apply(scipy.stats.skewtest)


Out[61]:
2013-2017
total_pop   -0.899063
dtype: float64

It does reduce skewness but does not make it disappear. How about kurtosis?


In [62]:
recent[['total_pop']].apply(np.log).apply(scipy.stats.kurtosis)
# recent[['total_pop']].apply(np.log).apply(scipy.stats.kurtosistest)


Out[62]:
2013-2017
total_pop    1.086877
dtype: float64

Again, it reduces but does not get rid of kurtosis.

Let's look visually at a histogram of the transformed distribution.

First, let's functionalize the histogram code from earlier and add in the


In [63]:
def plot_hist(df, variable, bins=20, xlabel=None, by=None,
              ylabel=None, title=None, logx=False, ax=None):

    if not ax:
        fig, ax = plt.subplots(figsize=(12,8))
    if logx:
        if df[variable].min() <=0:
            df[variable] = df[variable] - df[variable].min() + 1
            print('Warning: data <=0 exists, data transformed by %0.2g before plotting' % (- df[variable].min() + 1))
        
        bins = np.logspace(np.log10(df[variable].min()),
                           np.log10(df[variable].max()), bins)
        ax.set_xscale("log")

    ax.hist(df[variable].dropna().values, bins=bins);
    
    if xlabel:
        ax.set_xlabel(xlabel);
    if ylabel:
        ax.set_ylabel(ylabel);
    if title:
        ax.set_title(title);
    
    return ax

In [64]:
plot_hist(recent, 'total_pop', bins=25, logx=True, 
          xlabel='Log of total population', ylabel='Number of countries',
          title='Distribution of total population of countries 2013-2017');



In [65]:
plot_map(data, 'total_pop', '2013-2017', legend_name='Log of total population', log=True)


Out[65]:

Now we can see more variation across countries.

We can see larger countries have larger populations - that makes sense... we may hypothesize that water availability may affect countries with higher population density rather than higher absolute populations.

Normalization is a critical tool in the data scientist's toolbox and is often used for engineering features.

Normalization


In [66]:
recent['population_density'] = recent.total_pop.divide(recent.total_area)

In [67]:
plot_map(recent, 'population_density', legend_name='Population density', 
         threshold_scale=[0,0.3,0.8, 1.8, 78])


Out[67]:

Over time

One country

Check a sample we have familiarity with.

  • Are the units what we think they are?
  • Does this behavior correspond with our pre-existing knowledge?

In [68]:
plt.plot(time_series(data, 'United States of America', 'total_pop'));
plt.xlabel('Year');
plt.ylabel('Population');
plt.title('United States population over time');


One region


In [69]:
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    north_america = time_slice(subregion(data, 'North America'), '1958-1962').sort_values('total_pop').index.tolist()
    for country in north_america:
        plt.plot(time_series(data, country, 'total_pop'), label=country);
        plt.xlabel('Year');
        plt.ylabel('Population');
        plt.title('North American populations over time');
    plt.legend(loc=2,prop={'size':10});


This doesn't really tell us anything except North America is the biggest country. We'd like to understand how each country's population changes over time, mostly in reference to itself. Let's normalize again.

What should we normalize by? We could chose a country's minimum, mean, median, maximum... or any other location.

Let's choose minimum so we can see how much each country grows in reference to its starting population.


In [70]:
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    for country in north_america:
        ts = time_series(data, country, 'total_pop')
        ts['norm_pop'] = ts.total_pop/ts.total_pop.min()*100
        plt.plot(ts['norm_pop'], label=country);
        plt.xlabel('Year');
        plt.ylabel('Percent increase in population');
        plt.title('Percent increase in population from 1960 in North American countries');
    plt.legend(loc=2,prop={'size':10});


There are too many lines here! We should really only have 3-4 lines on a plot to be able to spot patterns. What we can see, however, is that generally, larger countries grow faster thatn smaller, with two main exceptions. But, we can't figure out which countries those are. Another option is a heatmap.


In [71]:
north_america_pop = variable_slice(subregion(data, 'North America'), 'total_pop')
north_america_norm_pop = north_america_pop.div(north_america_pop.min(axis=1), axis=0)*100
north_america_norm_pop = north_america_norm_pop.loc[north_america]

In [72]:
fig, ax = plt.subplots(figsize=(16, 12));
sns.heatmap(north_america_norm_pop, ax=ax, cmap=sns.light_palette((214, 90, 60), input="husl", as_cmap=True));
plt.xticks(rotation=45);
plt.xlabel('Time period');
plt.ylabel('Country, ordered by population in 1960 (<- greatest to least ->)');
plt.title('Percent increase in population from 1960');


Geospatial over time (Interactive widgets)

ipywidgets allow for you to change variables passed through to a function through some kind of widget, such as a selection slider (below), a drop down menu, or more.

To use ipywidgets you must:

  1. Have already run this from the command line: jupyter nbextension enable --py --sys-prefix widgetsnbextension
  2. Define your widget including the possible value options and which value to initialize the widget at.
  3. Define the function that you want to make interactive.
  4. Initialize the interaction through widgets.interact.

We perform all these steps within a function so we can reuse it for different variables:


In [73]:
def map_over_time(df, variable, time_periods, log=False, 
                  threshold_scale=None, legend_name=None):
    
    time_slider = widgets.SelectionSlider(options=time_periods.tolist(),
                                      value=time_periods[0],
                                      description='Time period:',
                                      disabled=False,
                                      button_style='')
    widgets.interact(plot_map, df=widgets.fixed(df), 
                     variable=widgets.fixed(variable),
                     time_period=time_slider, log=widgets.fixed(log), 
                     legend_name=widgets.fixed(legend_name), 
                     threshold_scale=widgets.fixed(threshold_scale));

In [74]:
time_periods = data.time_period.unique()

In [75]:
map_over_time(data, 'total_pop', time_periods, log=True)


Note on exploratory geospatial data analysis

Please note, geospatial data analysis is a whole field of its own and we don't begin to do it justice here. There are many technical assumptions for geospatial modeling that we have not looked at as well as number of numerical methods for exploring it. The PySal package is a good package to start with in the field and tutorial can be found here for introductory geospatial data analysis.

To do: Record questions that came up during the exploration

To do: Perform univariate analysis on an additional variable.

Extras

Exploring total renewable water resources


In [76]:
plot_hist(recent, 'total_renewable', bins=50, 
          xlabel='Total renewable water resources ($10^9 m^3/yr$)',
          ylabel='Number of countries', 
          title='Distribution of total renewable water resources, 2013-2017');



In [77]:
plot_hist(recent, 'total_renewable', bins=50, 
          xlabel='Total renewable water resources ($10^9 m^3/yr$)',
          ylabel='Number of countries', logx=True,
          title='Distribution of total renewable water resources, 2013-2017');



In [78]:
north_america_renew = variable_slice(subregion(data, 'North America'), 'total_renewable')

In [79]:
fig, ax = plt.subplots(figsize=(16, 12));
sns.heatmap(north_america_renew, ax=ax, cmap=sns.light_palette((214, 90, 60), input="husl", as_cmap=True));
plt.xticks(rotation=45);
plt.xlabel('Time period');
plt.ylabel('Country, ordered by population in 1960 (<- greatest to least ->)');
plt.title('Percent increase in population from 1960');


Total renewable resources doesn't seem to change over time... let's just check:


In [80]:
north_america_renew.head()


Out[80]:
time_period 1958-1962 1963-1967 1968-1972 1973-1977 1978-1982 1983-1987 1988-1992 1993-1997 1998-2002 2003-2007 2008-2012 2013-2017
country
Antigua and Barbuda 0.052 0.052 0.052 0.052 0.052 0.052 0.052 0.052 0.052 0.052 0.052 0.052
Bahamas 0.700 0.700 0.700 0.700 0.700 0.700 0.700 0.700 0.700 0.700 0.700 0.700
Barbados 0.080 0.080 0.080 0.080 0.080 0.080 0.080 0.080 0.080 0.080 0.080 0.080
Belize 21.730 21.730 21.730 21.730 21.730 21.730 21.730 21.730 21.730 21.730 21.730 21.730
Canada 2902.000 2902.000 2902.000 2902.000 2902.000 2902.000 2902.000 2902.000 2902.000 2902.000 2902.000 2902.000

Just to be sure, let's subtract 1958-1962 values from each period and add up the results:


In [81]:
north_america_renew.sub(north_america_renew.iloc[:,0], axis=0).sum()


Out[81]:
time_period
1958-1962    0.0
1963-1967    0.0
1968-1972    0.0
1973-1977    0.0
1978-1982    0.0
1983-1987    0.0
1988-1992    0.0
1993-1997    0.0
1998-2002    0.0
2003-2007    0.0
2008-2012    0.0
2013-2017    0.0
dtype: float64

Does this apply to the rest of the world?


In [82]:
renew = variable_slice(data, 'total_renewable')

In [83]:
renew.sub(renew.iloc[:,0], axis=0).sum()


Out[83]:
time_period
1958-1962     0.0
1963-1967     0.0
1968-1972     0.0
1973-1977     0.0
1978-1982     0.0
1983-1987     0.0
1988-1992     0.0
1993-1997   -14.0
1998-2002   -14.0
2003-2007   -17.0
2008-2012   -17.0
2013-2017   -17.0
dtype: float64

Uh oh, looks like not. Let's look at it by country instead:


In [84]:
renew.sub(renew.iloc[:,0], axis=0).sum(axis=1).sort_values().head()


Out[84]:
country
Bhutan        -79.0
Afghanistan     0.0
Netherlands     0.0
New Zealand     0.0
Nicaragua       0.0
dtype: float64

Bhutan changed! But no where else. Let's also check the other end of the sorted list:


In [85]:
renew.sub(renew.iloc[:,0], axis=0).sum(axis=1).sort_values().tail(50)


Out[85]:
country
Gambia                                       0.0
Germany                                      0.0
Ghana                                        0.0
Greece                                       0.0
Fiji                                         0.0
Zimbabwe                                     0.0
Armenia                                      NaN
Azerbaijan                                   NaN
Belarus                                      NaN
Bosnia and Herzegovina                       NaN
Cook Islands                                 NaN
Croatia                                      NaN
Czechia                                      NaN
Eritrea                                      NaN
Estonia                                      NaN
Ethiopia                                     NaN
Faroe Islands                                NaN
Georgia                                      NaN
Holy See                                     NaN
Kazakhstan                                   NaN
Kiribati                                     NaN
Kyrgyzstan                                   NaN
Latvia                                       NaN
Liechtenstein                                NaN
Lithuania                                    NaN
Marshall Islands                             NaN
Micronesia (Federated States of)             NaN
Monaco                                       NaN
Montenegro                                   NaN
Nauru                                        NaN
Niue                                         NaN
Palau                                        NaN
Republic of Moldova                          NaN
Russian Federation                           NaN
Samoa                                        NaN
San Marino                                   NaN
Serbia                                       NaN
Seychelles                                   NaN
Slovakia                                     NaN
Slovenia                                     NaN
South Sudan                                  NaN
Sudan                                        NaN
Tajikistan                                   NaN
The former Yugoslav Republic of Macedonia    NaN
Tokelau                                      NaN
Tonga                                        NaN
Turkmenistan                                 NaN
Tuvalu                                       NaN
Ukraine                                      NaN
Uzbekistan                                   NaN
dtype: float64

Oh, we've got NaNs... probably because these countries didn't always exist.

Assessing many variables


In [86]:
def two_hist(df, variable, bins=50,
              ylabel='Number of countries', title=None):

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18,8))
    ax1 = plot_hist(df, variable, bins=bins, 
                    xlabel=variable, ylabel=ylabel, 
                    ax=ax1, title=variable if not title else title)
    ax2 = plot_hist(df, variable, bins=bins, 
                    xlabel='Log of '+ variable, ylabel=ylabel, 
                    logx=True, ax=ax2, 
                    title='Log of '+ variable if not title else title)
    plt.close()
    return fig

In [87]:
def hist_over_var(df, variables, bins=50,
                  ylabel='Number of countries', title=None):
    
    variable_slider = widgets.Dropdown(options=variables.tolist(),
                                      value=variables[0],
                                      description='Variable:',
                                      disabled=False,
                                      button_style='')
    widgets.interact(two_hist, df=widgets.fixed(df), 
                     variable=variable_slider, ylabel=widgets.fixed(ylabel),
                     title=widgets.fixed(title), bins=widgets.fixed(bins));

In [88]:
hist_over_var(recent, recent.columns, bins=20)