In [45]:

    
# must go first 
%matplotlib inline 
%config InlineBackend.figure_format='retina'

# plotting
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_context("poster", font_scale=1.3)
import folium

# system packages 
import os, sys
import warnings
warnings.filterwarnings('ignore')

# basic wrangling 
import numpy as np
import pandas as pd

# eda tools 
import pivottablejs
import missingno as msno
import pandas_profiling

# interactive 
import ipywidgets as widgets

# more technical eda
import sklearn
import scipy



In [46]:

    
sys.path.append('../../scripts/')



In [47]:

    
from aqua_helper import time_slice, country_slice, time_series, simple_regions, subregion, variable_slice



In [48]:

    
mpl_update = {'font.size':16,
              'xtick.labelsize':14,
              'ytick.labelsize':14,
              'figure.figsize':[12.0,8.0],
              'axes.color_cycle':['#0055A7', '#2C3E4F', '#26C5ED', '#00cc66', '#D34100', '#FF9700','#091D32'], 
              'axes.labelsize':16,
              'axes.labelcolor':'#677385',
              'axes.titlesize':20,
              'lines.color':'#0055A7',
              'lines.linewidth':3,
              'text.color':'#677385'}
mpl.rcParams.update(mpl_update)

Our plan

Exploratory data analysis consists of the following major tasks, which we present linearly here because each task doesn't make much sense to do without the ones prior to it. However, in reality, you are going to constantly jump around from step to step. You may want to do all the steps for a subset of the variables first. Or often, an observation will bring up a question you want to investigate and you'll branch off and explore to answer that question before returning down the main path of exhaustive EDA.

Form hypotheses/develop investigation themes to explore
Wrangle data
Assess quality of data
Profile data
Explore each individual variable in the dataset
Assess the relationship between each variable and the target
Assess interactions between variables
Explore data across many dimensions

Throughout the entire analysis you want to:

Capture a list of hypotheses and questions that come up for further exploration.
Record things to watch out for/ be aware of in future analyses.
Show intermediate results to colleagues to get a fresh perspective, feedback, domain knowledge. Don't do EDA in a bubble! Get feedback throughout especially from people removed from the problem and/or with relevant domain knowledge.
Position visuals and results together. EDA relies on your natural pattern recognition abilities so maximize what you'll find by putting visualizations and results in close proximity.

Data wrangling

http://www.fao.org/nr/water/aquastat/data/query/index.html



In [49]:

    
data = pd.read_csv('../../data/aquastat/aquastat.csv.gzip', compression='gzip')

# simplify regions
data.region = data.region.apply(lambda x: simple_regions[x])

# remove exploitable fields and national rainfall index
data = data.loc[~data.variable.str.contains('exploitable'),:]
data = data.loc[~(data.variable=='national_rainfall_index')]

Overview of variables



In [50]:

    
data[['variable','variable_full']].drop_duplicates()









    Out[50]:






  
    
      
      variable
      variable_full
    
  
  
    
      0
      total_area
      Total area of the country (1000 ha)
    
    
      576
      arable_land
      Arable land area (1000 ha)
    
    
      1152
      permanent_crop_area
      Permanent crops area (1000 ha)
    
    
      1728
      cultivated_area
      Cultivated area (arable land + permanent crops...
    
    
      2304
      percent_cultivated
      % of total country area cultivated (%)
    
    
      2880
      total_pop
      Total population (1000 inhab)
    
    
      3456
      rural_pop
      Rural population (1000 inhab)
    
    
      4032
      urban_pop
      Urban population (1000 inhab)
    
    
      4608
      gdp
      Gross Domestic Product (GDP) (current US$)
    
    
      5184
      gdp_per_capita
      GDP per capita (current US$/inhab)
    
    
      5760
      agg_to_gdp
      Agriculture, value added to GDP (%)
    
    
      6336
      human_dev_index
      Human Development Index (HDI) [highest = 1] (-)
    
    
      6912
      gender_inequal_index
      Gender Inequality Index (GII) [equality = 0; i...
    
    
      7488
      percent_undernourished
      Prevalence of undernourishment (3-year average...
    
    
      8064
      number_undernourished
      Number of people undernourished (3-year averag...
    
    
      8640
      avg_annual_rain_depth
      Long-term average annual precipitation in dept...
    
    
      9216
      avg_annual_rain_vol
      Long-term average annual precipitation in volu...
    
    
      10368
      surface_water_produced
      Surface water produced internally (10^9 m3/year)
    
    
      10944
      groundwater_produced
      Groundwater produced internally (10^9 m3/year)
    
    
      11520
      surface_groundwater_overlap
      Overlap between surface water and groundwater ...
    
    
      12096
      irwr
      Total internal renewable water resources (IRWR...
    
    
      12672
      irwr_per_capita
      Total internal renewable water resources per c...
    
    
      13248
      surface_entering
      Surface water: entering the country (total) (1...
    
    
      13824
      surface_inflow_submit_no_treaty
      Surface water: inflow not submitted to treatie...
    
    
      14400
      surface_inflow_submit_treaty
      Surface water: inflow submitted to treaties (1...
    
    
      14976
      surface_inflow_secure_treaty
      Surface water: inflow secured through treaties...
    
    
      15552
      total_flow_border_rivers
      Surface water: total flow of border rivers (10...
    
    
      16128
      accounted_flow_border_rivers
      Surface water: accounted flow of border rivers...
    
    
      16704
      accounted_flow
      Surface water: accounted inflow (10^9 m3/year)
    
    
      17280
      surface_to_other_countries
      Surface water: leaving the country to other co...
    
    
      17856
      surface_outflow_submit_no_treaty
      Surface water: outflow to other countries not ...
    
    
      18432
      surface_outflow_submit_treaty
      Surface water: outflow to other countries subm...
    
    
      19008
      surface_outflow_secure_treaty
      Surface water: outflow to other countries secu...
    
    
      19584
      surface_total_external_renewable
      Surface water: total external renewable (10^9 ...
    
    
      20160
      groundwater_entering
      Groundwater: entering the country (total) (10^...
    
    
      20736
      groundwater_accounted_inflow
      Groundwater: accounted inflow (10^9 m3/year)
    
    
      21312
      groundwater_to_other_countries
      Groundwater: leaving the country to other coun...
    
    
      21888
      groundwater_accounted_outflow
      Groundwater: accounted outflow to other countr...
    
    
      22464
      water_total_external_renewable
      Water resources: total external renewable (10^...
    
    
      23040
      total_renewable_surface
      Total renewable surface water (10^9 m3/year)
    
    
      23616
      total_renewable_groundwater
      Total renewable groundwater (10^9 m3/year)
    
    
      24192
      overlap_surface_groundwater
      Overlap: between surface water and groundwater...
    
    
      24768
      total_renewable
      Total renewable water resources (10^9 m3/year)
    
    
      25344
      dependency_ratio
      Dependency ratio (%)
    
    
      25920
      total_renewable_per_capita
      Total renewable water resources per capita (m3...
    
    
      29376
      interannual_variability
      Interannual variability (WRI) (-)
    
    
      29952
      seasonal_variability
      Seasonal variability (WRI) (-)
    
    
      30528
      total_dam_capacity
      Total dam capacity (km3)
    
    
      31104
      dam_capacity_per_capita
      Dam capacity per capita (m3/inhab)
    
    
      31680
      irrigation_potential
      Irrigation potential (1000 ha)
    
    
      32256
      flood_occurence
      Flood occurrence (WRI) (-)
    
    
      32832
      total_pop_access_drinking
      Total population with access to safe drinking-...
    
    
      33408
      rural_pop_access_drinking
      Rural population with access to safe drinking-...
    
    
      33984
      urban_pop_access_drinking
      Urban population with access to safe drinking-...

Subset for cross-sectional analysis



In [51]:

    
recent = time_slice(data, '2013-2017')

Exploring population

Cross-section

For numerical data, look at:

Location: mean, median, mode, interquartile mean
Spread: standard deviation, variance, range, interquartile range
Shape: skewness, kurtosis

Location and spread of the data

Are minimum/maximum values feasible?



In [52]:

    
recent[['total_pop', 'urban_pop', 'rural_pop']].describe().astype(int)

Rural population is negative... what does that mean?



In [53]:

    
recent.sort_values('rural_pop')[['total_pop','urban_pop','rural_pop']].head()









    Out[53]:






  
    
      2013-2017
      total_pop
      urban_pop
      rural_pop
    
    
      country
      
      
      
    
  
  
    
      Qatar
      2235.00
      2333.00
      -98.00
    
    
      Singapore
      5604.00
      5619.00
      -15.00
    
    
      Monaco
      37.73
      38.32
      -0.59
    
    
      Holy See
      0.80
      0.80
      0.00
    
    
      Nauru
      10.22
      10.12
      0.10

Visit glossary for dataset: http://www.fao.org/nr/water/aquastat/data/glossary/search.html?termId=4105&submitBtn=s&cls=yes

Rural population = Total population - urban population



In [54]:

    
time_series(data, 'Qatar', 'total_pop').join(time_series(data, 'Qatar', 'urban_pop')).join(time_series(data, 'Qatar', 'rural_pop'))









    Out[54]:






  
    
      
      total_pop
      urban_pop
      rural_pop
    
    
      year_measured
      
      
      
    
  
  
    
      1962
      56.19
      48.39
      7.80
    
    
      1967
      86.16
      75.48
      10.68
    
    
      1972
      130.40
      115.60
      14.80
    
    
      1977
      182.40
      162.40
      20.00
    
    
      1982
      277.20
      248.60
      28.60
    
    
      1987
      423.30
      385.40
      37.90
    
    
      1992
      489.70
      459.10
      30.60
    
    
      1997
      528.20
      506.50
      21.70
    
    
      2002
      634.40
      608.90
      25.50
    
    
      2007
      1179.00
      1130.00
      49.00
    
    
      2012
      2016.00
      2029.00
      -13.00
    
    
      2015
      2235.00
      2333.00
      -98.00

What to do about non-physical numbers? Remove? Replace with estimates from other sources like the world bank? --> Need to have a clear thought out reason for what you decide to do

We should also start suspecting that the data is skewed:

50% quartile much closer to 25% or 75% in value
Large difference between mean and 50%

Shape of the data

Is the distribution skewed? Bimodal?
Are there outliers? Are they feasible?
Are there discontinuities?



In [55]:

    
recent[['total_pop', 'urban_pop', 'rural_pop']].describe().astype(int)

Yes, it looks like population is skewed. Let's try calculting skewness and kurtosis and plot a histogram to visualize.

Skewness and kurtosis

Skewness: measure of lack of symmetry.

Kurtosis: measure of whether the data are heavily tailed relative to the normal distribution.



In [56]:

    
recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.skew)









    Out[56]:





2013-2017
total_pop    8.519379
urban_pop    8.545690
rural_pop    9.490029
dtype: float64

Skewness for normal distribution should be zero. Negative skewness indicates skew left and positive skewness indicates skew right.



In [57]:

    
recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.kurtosis)









    Out[57]:





2013-2017
total_pop    76.923725
urban_pop    85.499659
rural_pop    95.838930
dtype: float64

Kurtosis is also zero for a normal distribution and can only be postiive. We definitely have some outliers!

The trusty histogram



In [58]:

    
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(recent.total_pop.values, bins=50);
ax.set_xlabel('Total population');
ax.set_ylabel('Number of countries');
ax.set_title('Distribution of population of countries 2013-2017');

Let's functionalize this so we can use it again.

Yup, definitely skewed. Why is this a problem?

Many models assume a normal (bell-like) curve
It is difficult to differentiate observations when skewed (~150/199 look all the same!)

Let's look at a map:



In [59]:

    
# Extending the code we used for plotting nulls geospatially
def plot_map(df, variable, time_period=None, log=False, 
             legend_name=None, threshold_scale=None):
    geo = r'../../data/aquastat/world.json'
    
    legend_name = legend_name if legend_name else '%s for %s' % (variable, time_period)
    if time_period:
        df = time_slice(df, time_period).reset_index()
    else: 
        df = df.reset_index()
    
    if log:
        df[variable] = df[variable].apply(np.log)
        
    map = folium.Map(location=[34,-45], zoom_start=2,
                     width=1200, height=600)
    map.choropleth(geo_path=geo, 
                   data=df,
                   columns=['country', variable],
                   key_on='feature.properties.name', reset=True,
                   fill_color='PuBuGn', fill_opacity=0.7, line_opacity=0.2,
                   legend_name=legend_name,
                   threshold_scale=threshold_scale)
    return map



In [60]:

    
plot_map(data, 'total_pop', '2013-2017', legend_name='Total population')









    Out[60]:

What is the solution? Often, taking a log transform will make a variable more normal.

Log transform

Does taking the log reduce skewness?



In [61]:

    
recent[['total_pop']].apply(np.log).apply(scipy.stats.skew)
# recent[['total_pop']].apply(np.log).apply(scipy.stats.skewtest)









    Out[61]:





2013-2017
total_pop   -0.899063
dtype: float64

It does reduce skewness but does not make it disappear. How about kurtosis?



In [62]:

    
recent[['total_pop']].apply(np.log).apply(scipy.stats.kurtosis)
# recent[['total_pop']].apply(np.log).apply(scipy.stats.kurtosistest)









    Out[62]:





2013-2017
total_pop    1.086877
dtype: float64

Again, it reduces but does not get rid of kurtosis.

Let's look visually at a histogram of the transformed distribution.

First, let's functionalize the histogram code from earlier and add in the



In [63]:

    
def plot_hist(df, variable, bins=20, xlabel=None, by=None,
              ylabel=None, title=None, logx=False, ax=None):

    if not ax:
        fig, ax = plt.subplots(figsize=(12,8))
    if logx:
        if df[variable].min() <=0:
            df[variable] = df[variable] - df[variable].min() + 1
            print('Warning: data <=0 exists, data transformed by %0.2g before plotting' % (- df[variable].min() + 1))
        
        bins = np.logspace(np.log10(df[variable].min()),
                           np.log10(df[variable].max()), bins)
        ax.set_xscale("log")

    ax.hist(df[variable].dropna().values, bins=bins);
    
    if xlabel:
        ax.set_xlabel(xlabel);
    if ylabel:
        ax.set_ylabel(ylabel);
    if title:
        ax.set_title(title);
    
    return ax



In [64]:

    
plot_hist(recent, 'total_pop', bins=25, logx=True, 
          xlabel='Log of total population', ylabel='Number of countries',
          title='Distribution of total population of countries 2013-2017');



In [65]:

    
plot_map(data, 'total_pop', '2013-2017', legend_name='Log of total population', log=True)









    Out[65]:

Now we can see more variation across countries.

We can see larger countries have larger populations - that makes sense... we may hypothesize that water availability may affect countries with higher population density rather than higher absolute populations.

Normalization is a critical tool in the data scientist's toolbox and is often used for engineering features.

Normalization



In [66]:

    
recent['population_density'] = recent.total_pop.divide(recent.total_area)



In [67]:

    
plot_map(recent, 'population_density', legend_name='Population density', 
         threshold_scale=[0,0.3,0.8, 1.8, 78])









    Out[67]:

Over time

One country

Check a sample we have familiarity with.

Are the units what we think they are?
Does this behavior correspond with our pre-existing knowledge?



In [68]:

    
plt.plot(time_series(data, 'United States of America', 'total_pop'));
plt.xlabel('Year');
plt.ylabel('Population');
plt.title('United States population over time');

One region



In [69]:

    
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    north_america = time_slice(subregion(data, 'North America'), '1958-1962').sort_values('total_pop').index.tolist()
    for country in north_america:
        plt.plot(time_series(data, country, 'total_pop'), label=country);
        plt.xlabel('Year');
        plt.ylabel('Population');
        plt.title('North American populations over time');
    plt.legend(loc=2,prop={'size':10});

This doesn't really tell us anything except North America is the biggest country. We'd like to understand how each country's population changes over time, mostly in reference to itself. Let's normalize again.

What should we normalize by? We could chose a country's minimum, mean, median, maximum... or any other location.

Let's choose minimum so we can see how much each country grows in reference to its starting population.



In [70]:

    
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    for country in north_america:
        ts = time_series(data, country, 'total_pop')
        ts['norm_pop'] = ts.total_pop/ts.total_pop.min()*100
        plt.plot(ts['norm_pop'], label=country);
        plt.xlabel('Year');
        plt.ylabel('Percent increase in population');
        plt.title('Percent increase in population from 1960 in North American countries');
    plt.legend(loc=2,prop={'size':10});

There are too many lines here! We should really only have 3-4 lines on a plot to be able to spot patterns. What we can see, however, is that generally, larger countries grow faster thatn smaller, with two main exceptions. But, we can't figure out which countries those are. Another option is a heatmap.



In [71]:

    
north_america_pop = variable_slice(subregion(data, 'North America'), 'total_pop')
north_america_norm_pop = north_america_pop.div(north_america_pop.min(axis=1), axis=0)*100
north_america_norm_pop = north_america_norm_pop.loc[north_america]



In [72]:

    
fig, ax = plt.subplots(figsize=(16, 12));
sns.heatmap(north_america_norm_pop, ax=ax, cmap=sns.light_palette((214, 90, 60), input="husl", as_cmap=True));
plt.xticks(rotation=45);
plt.xlabel('Time period');
plt.ylabel('Country, ordered by population in 1960 (<- greatest to least ->)');
plt.title('Percent increase in population from 1960');

Geospatial over time (Interactive widgets)

ipywidgets allow for you to change variables passed through to a function through some kind of widget, such as a selection slider (below), a drop down menu, or more.

To use ipywidgets you must:

Have already run this from the command line: jupyter nbextension enable --py --sys-prefix widgetsnbextension
Define your widget including the possible value options and which value to initialize the widget at.
Define the function that you want to make interactive.
Initialize the interaction through widgets.interact.

We perform all these steps within a function so we can reuse it for different variables:



In [73]:

    
def map_over_time(df, variable, time_periods, log=False, 
                  threshold_scale=None, legend_name=None):
    
    time_slider = widgets.SelectionSlider(options=time_periods.tolist(),
                                      value=time_periods[0],
                                      description='Time period:',
                                      disabled=False,
                                      button_style='')
    widgets.interact(plot_map, df=widgets.fixed(df), 
                     variable=widgets.fixed(variable),
                     time_period=time_slider, log=widgets.fixed(log), 
                     legend_name=widgets.fixed(legend_name), 
                     threshold_scale=widgets.fixed(threshold_scale));



In [74]:

    
time_periods = data.time_period.unique()



In [75]:

    
map_over_time(data, 'total_pop', time_periods, log=True)

Note on exploratory geospatial data analysis

Please note, geospatial data analysis is a whole field of its own and we don't begin to do it justice here. There are many technical assumptions for geospatial modeling that we have not looked at as well as number of numerical methods for exploring it. The PySal package is a good package to start with in the field and tutorial can be found here for introductory geospatial data analysis.

To do: Record questions that came up during the exploration

To do: Perform univariate analysis on an additional variable.

Extras

Exploring total renewable water resources



In [76]:

    
plot_hist(recent, 'total_renewable', bins=50, 
          xlabel='Total renewable water resources ($10^9 m^3/yr$)',
          ylabel='Number of countries', 
          title='Distribution of total renewable water resources, 2013-2017');



In [77]:

    
plot_hist(recent, 'total_renewable', bins=50, 
          xlabel='Total renewable water resources ($10^9 m^3/yr$)',
          ylabel='Number of countries', logx=True,
          title='Distribution of total renewable water resources, 2013-2017');



In [78]:

    
north_america_renew = variable_slice(subregion(data, 'North America'), 'total_renewable')



In [79]:

    
fig, ax = plt.subplots(figsize=(16, 12));
sns.heatmap(north_america_renew, ax=ax, cmap=sns.light_palette((214, 90, 60), input="husl", as_cmap=True));
plt.xticks(rotation=45);
plt.xlabel('Time period');
plt.ylabel('Country, ordered by population in 1960 (<- greatest to least ->)');
plt.title('Percent increase in population from 1960');

Total renewable resources doesn't seem to change over time... let's just check:



In [80]:

    
north_america_renew.head()









    Out[80]:






  
    
      time_period
      1958-1962
      1963-1967
      1968-1972
      1973-1977
      1978-1982
      1983-1987
      1988-1992
      1993-1997
      1998-2002
      2003-2007
      2008-2012
      2013-2017
    
    
      country
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      Antigua and Barbuda
      0.052
      0.052
      0.052
      0.052
      0.052
      0.052
      0.052
      0.052
      0.052
      0.052
      0.052
      0.052
    
    
      Bahamas
      0.700
      0.700
      0.700
      0.700
      0.700
      0.700
      0.700
      0.700
      0.700
      0.700
      0.700
      0.700
    
    
      Barbados
      0.080
      0.080
      0.080
      0.080
      0.080
      0.080
      0.080
      0.080
      0.080
      0.080
      0.080
      0.080
    
    
      Belize
      21.730
      21.730
      21.730
      21.730
      21.730
      21.730
      21.730
      21.730
      21.730
      21.730
      21.730
      21.730
    
    
      Canada
      2902.000
      2902.000
      2902.000
      2902.000
      2902.000
      2902.000
      2902.000
      2902.000
      2902.000
      2902.000
      2902.000
      2902.000

Just to be sure, let's subtract 1958-1962 values from each period and add up the results:



In [81]:

    
north_america_renew.sub(north_america_renew.iloc[:,0], axis=0).sum()









    Out[81]:





time_period
1958-1962    0.0
1963-1967    0.0
1968-1972    0.0
1973-1977    0.0
1978-1982    0.0
1983-1987    0.0
1988-1992    0.0
1993-1997    0.0
1998-2002    0.0
2003-2007    0.0
2008-2012    0.0
2013-2017    0.0
dtype: float64

Does this apply to the rest of the world?



In [82]:

    
renew = variable_slice(data, 'total_renewable')



In [83]:

    
renew.sub(renew.iloc[:,0], axis=0).sum()









    Out[83]:





time_period
1958-1962     0.0
1963-1967     0.0
1968-1972     0.0
1973-1977     0.0
1978-1982     0.0
1983-1987     0.0
1988-1992     0.0
1993-1997   -14.0
1998-2002   -14.0
2003-2007   -17.0
2008-2012   -17.0
2013-2017   -17.0
dtype: float64

Uh oh, looks like not. Let's look at it by country instead:



In [84]:

    
renew.sub(renew.iloc[:,0], axis=0).sum(axis=1).sort_values().head()









    Out[84]:





country
Bhutan        -79.0
Afghanistan     0.0
Netherlands     0.0
New Zealand     0.0
Nicaragua       0.0
dtype: float64

Bhutan changed! But no where else. Let's also check the other end of the sorted list:



In [85]:

    
renew.sub(renew.iloc[:,0], axis=0).sum(axis=1).sort_values().tail(50)









    Out[85]:





country
Gambia                                       0.0
Germany                                      0.0
Ghana                                        0.0
Greece                                       0.0
Fiji                                         0.0
Zimbabwe                                     0.0
Armenia                                      NaN
Azerbaijan                                   NaN
Belarus                                      NaN
Bosnia and Herzegovina                       NaN
Cook Islands                                 NaN
Croatia                                      NaN
Czechia                                      NaN
Eritrea                                      NaN
Estonia                                      NaN
Ethiopia                                     NaN
Faroe Islands                                NaN
Georgia                                      NaN
Holy See                                     NaN
Kazakhstan                                   NaN
Kiribati                                     NaN
Kyrgyzstan                                   NaN
Latvia                                       NaN
Liechtenstein                                NaN
Lithuania                                    NaN
Marshall Islands                             NaN
Micronesia (Federated States of)             NaN
Monaco                                       NaN
Montenegro                                   NaN
Nauru                                        NaN
Niue                                         NaN
Palau                                        NaN
Republic of Moldova                          NaN
Russian Federation                           NaN
Samoa                                        NaN
San Marino                                   NaN
Serbia                                       NaN
Seychelles                                   NaN
Slovakia                                     NaN
Slovenia                                     NaN
South Sudan                                  NaN
Sudan                                        NaN
Tajikistan                                   NaN
The former Yugoslav Republic of Macedonia    NaN
Tokelau                                      NaN
Tonga                                        NaN
Turkmenistan                                 NaN
Tuvalu                                       NaN
Ukraine                                      NaN
Uzbekistan                                   NaN
dtype: float64

Oh, we've got NaNs... probably because these countries didn't always exist.

Assessing many variables



In [86]:

    
def two_hist(df, variable, bins=50,
              ylabel='Number of countries', title=None):

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18,8))
    ax1 = plot_hist(df, variable, bins=bins, 
                    xlabel=variable, ylabel=ylabel, 
                    ax=ax1, title=variable if not title else title)
    ax2 = plot_hist(df, variable, bins=bins, 
                    xlabel='Log of '+ variable, ylabel=ylabel, 
                    logx=True, ax=ax2, 
                    title='Log of '+ variable if not title else title)
    plt.close()
    return fig



In [87]:

    
def hist_over_var(df, variables, bins=50,
                  ylabel='Number of countries', title=None):
    
    variable_slider = widgets.Dropdown(options=variables.tolist(),
                                      value=variables[0],
                                      description='Variable:',
                                      disabled=False,
                                      button_style='')
    widgets.interact(two_hist, df=widgets.fixed(df), 
                     variable=variable_slider, ylabel=widgets.fixed(ylabel),
                     title=widgets.fixed(title), bins=widgets.fixed(bins));



In [88]:

    
hist_over_var(recent, recent.columns, bins=20)

2013-2017	total_pop	urban_pop	rural_pop
count	199	199	199
mean	36890	19849	17040
std	140720	69681	77461
min	0	0	-98
25%	1368	822	500
50%	7595	3967	2404
75%	25088	11656	10677
max	1407306	805387	891112

2013-2017	total_pop	urban_pop	rural_pop
count	199	199	199
mean	36890	19849	17040
std	140720	69681	77461
min	0	0	-98
25%	1368	822	500
50%	7595	3967	2404
75%	25088	11656	10677
max	1407306	805387	891112

	variable	variable_full
0	total_area	Total area of the country (1000 ha)
576	arable_land	Arable land area (1000 ha)
1152	permanent_crop_area	Permanent crops area (1000 ha)
1728	cultivated_area	Cultivated area (arable land + permanent crops...
2304	percent_cultivated	% of total country area cultivated (%)
2880	total_pop	Total population (1000 inhab)
3456	rural_pop	Rural population (1000 inhab)
4032	urban_pop	Urban population (1000 inhab)
4608	gdp	Gross Domestic Product (GDP) (current US$)
5184	gdp_per_capita	GDP per capita (current US$/inhab)
5760	agg_to_gdp	Agriculture, value added to GDP (%)
6336	human_dev_index	Human Development Index (HDI) [highest = 1] (-)
6912	gender_inequal_index	Gender Inequality Index (GII) [equality = 0; i...
7488	percent_undernourished	Prevalence of undernourishment (3-year average...
8064	number_undernourished	Number of people undernourished (3-year averag...
8640	avg_annual_rain_depth	Long-term average annual precipitation in dept...
9216	avg_annual_rain_vol	Long-term average annual precipitation in volu...
10368	surface_water_produced	Surface water produced internally (10^9 m3/year)
10944	groundwater_produced	Groundwater produced internally (10^9 m3/year)
11520	surface_groundwater_overlap	Overlap between surface water and groundwater ...
12096	irwr	Total internal renewable water resources (IRWR...
12672	irwr_per_capita	Total internal renewable water resources per c...
13248	surface_entering	Surface water: entering the country (total) (1...
13824	surface_inflow_submit_no_treaty	Surface water: inflow not submitted to treatie...
14400	surface_inflow_submit_treaty	Surface water: inflow submitted to treaties (1...
14976	surface_inflow_secure_treaty	Surface water: inflow secured through treaties...
15552	total_flow_border_rivers	Surface water: total flow of border rivers (10...
16128	accounted_flow_border_rivers	Surface water: accounted flow of border rivers...
16704	accounted_flow	Surface water: accounted inflow (10^9 m3/year)
17280	surface_to_other_countries	Surface water: leaving the country to other co...
17856	surface_outflow_submit_no_treaty	Surface water: outflow to other countries not ...
18432	surface_outflow_submit_treaty	Surface water: outflow to other countries subm...
19008	surface_outflow_secure_treaty	Surface water: outflow to other countries secu...
19584	surface_total_external_renewable	Surface water: total external renewable (10^9 ...
20160	groundwater_entering	Groundwater: entering the country (total) (10^...
20736	groundwater_accounted_inflow	Groundwater: accounted inflow (10^9 m3/year)
21312	groundwater_to_other_countries	Groundwater: leaving the country to other coun...
21888	groundwater_accounted_outflow	Groundwater: accounted outflow to other countr...
22464	water_total_external_renewable	Water resources: total external renewable (10^...
23040	total_renewable_surface	Total renewable surface water (10^9 m3/year)
23616	total_renewable_groundwater	Total renewable groundwater (10^9 m3/year)
24192	overlap_surface_groundwater	Overlap: between surface water and groundwater...
24768	total_renewable	Total renewable water resources (10^9 m3/year)
25344	dependency_ratio	Dependency ratio (%)
25920	total_renewable_per_capita	Total renewable water resources per capita (m3...
29376	interannual_variability	Interannual variability (WRI) (-)
29952	seasonal_variability	Seasonal variability (WRI) (-)
30528	total_dam_capacity	Total dam capacity (km3)
31104	dam_capacity_per_capita	Dam capacity per capita (m3/inhab)
31680	irrigation_potential	Irrigation potential (1000 ha)
32256	flood_occurence	Flood occurrence (WRI) (-)
32832	total_pop_access_drinking	Total population with access to safe drinking-...
33408	rural_pop_access_drinking	Rural population with access to safe drinking-...
33984	urban_pop_access_drinking	Urban population with access to safe drinking-...

2013-2017	total_pop	urban_pop	rural_pop
country
Qatar	2235.00	2333.00	-98.00
Singapore	5604.00	5619.00	-15.00
Monaco	37.73	38.32	-0.59
Holy See	0.80	0.80	0.00
Nauru	10.22	10.12	0.10

	total_pop	urban_pop	rural_pop
year_measured
1962	56.19	48.39	7.80
1967	86.16	75.48	10.68
1972	130.40	115.60	14.80
1977	182.40	162.40	20.00
1982	277.20	248.60	28.60
1987	423.30	385.40	37.90
1992	489.70	459.10	30.60
1997	528.20	506.50	21.70
2002	634.40	608.90	25.50
2007	1179.00	1130.00	49.00
2012	2016.00	2029.00	-13.00
2015	2235.00	2333.00	-98.00

time_period	1958-1962	1963-1967	1968-1972	1973-1977	1978-1982	1983-1987	1988-1992	1993-1997	1998-2002	2003-2007	2008-2012	2013-2017
country
Antigua and Barbuda	0.052	0.052	0.052	0.052	0.052	0.052	0.052	0.052	0.052	0.052	0.052	0.052
Bahamas	0.700	0.700	0.700	0.700	0.700	0.700	0.700	0.700	0.700	0.700	0.700	0.700
Barbados	0.080	0.080	0.080	0.080	0.080	0.080	0.080	0.080	0.080	0.080	0.080	0.080
Belize	21.730	21.730	21.730	21.730	21.730	21.730	21.730	21.730	21.730	21.730	21.730	21.730
Canada	2902.000	2902.000	2902.000	2902.000	2902.000	2902.000	2902.000	2902.000	2902.000	2902.000	2902.000	2902.000

Table of Contents