In [1]:
import sys

# Major version of the running interpreter (2 or 3). Detected automatically so
# the compatibility imports further down never depend on a hand-edited value.
PYTHON_VERSION = sys.version_info[0]
# Set to True to download the World Bank CSV archives instead of reading the
# local copies of the csv files.
DOWNLOAD_DATA = False
In [2]:
from time import sleep
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython import display
from matplotlib.lines import Line2D
%matplotlib inline
# Bind `StringIO` and `urlopen` to whichever implementation matches the
# interpreter, so the rest of the notebook can use one spelling.
if PYTHON_VERSION != 2:
    from io import BytesIO as StringIO
    from urllib.request import urlopen
else:
    from StringIO import StringIO
    from urllib import urlopen

# Large fonts and thick lines for every figure in the notebook.
line_settings = {"lines.linewidth": 4}
sns.set_context("talk", font_scale=2, rc=line_settings)
In [9]:
# Name of the column that identifies the country in the loaded tables.
country = 'Country Name'
# Countries kept when filtering the reshaped frames and when plotting.
countries = [
    'United States',
    'China',
    'India',
    'Ireland',
]
def get_worldbank_dataframe(indicator_id):
    """Load one World Bank indicator table into a DataFrame.

    When ``DOWNLOAD_DATA`` is truthy, ``indicator_id`` is used as an indicator
    code (e.g. ``'SP.POP.TOTL'``): the zipped CSV export is downloaded from the
    World Bank API and the first archive member whose name does not start with
    ``'Metadata'`` is parsed.  Otherwise ``indicator_id`` is used as the base
    name of a local export and ``<indicator_id>/<indicator_id>.csv`` is read.

    Parameters
    ----------
    indicator_id : str
        Indicator code (download mode) or local export base name.

    Returns
    -------
    pandas.DataFrame
        The table as parsed by ``pd.read_csv`` with the first four lines
        skipped.

    Raises
    ------
    ValueError
        If the downloaded archive contains no non-metadata member.
    """
    if DOWNLOAD_DATA:
        # `closing` works with both the Python 2 and Python 3 `urlopen`; the
        # Python 2 response object is not a context manager on its own.
        from contextlib import closing

        address = 'http://api.worldbank.org/v2/en/indicator/{0}?downloadformat=csv'.format(indicator_id)
        with closing(urlopen(address)) as response:
            payload = response.read()
        with ZipFile(StringIO(payload)) as archive:
            members = [f for f in archive.namelist() if not f.startswith('Metadata')]
            if not members:
                raise ValueError(
                    'no data file found in archive for indicator {0!r}'.format(indicator_id)
                )
            # Read the member fully into memory so the archive can be closed
            # before pandas parses it.
            data = StringIO(archive.read(members[0]))
    else:
        data = '{0}/{0}.csv'.format(indicator_id)
    return pd.read_csv(data, skiprows=4)
def rearrange_dataframe(df, indicator_name):
    """Reshape a wide per-year table into long form indexed by (year, country).

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table containing the column named by the notebook-level
        ``country`` variable plus one column per year.
    indicator_name : str
        Label given to the single value column of the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with a two-level index ``['year', country]`` and one
        column named ``indicator_name``.  Year labels are kept exactly as they
        appear in the column headers; ``df`` itself is not modified.
    """
    # Year columns are recognised by their first character ('1...' or '2...').
    # `str(...)` keeps empty or non-string column labels from raising.
    years = [c for c in df.columns if str(c).startswith(('1', '2'))]
    long_df = pd.melt(
        df[[country] + years],
        id_vars=country,
        var_name='year',
        value_name=indicator_name,
    )
    return long_df.set_index(['year', country])
def plot_indicator(df, indicator_name):
    """Draw one line per country showing ``indicator_name`` over the years.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a two-level index whose first level holds the year and
        whose second level holds the country, and a column ``indicator_name``.
        Year labels must be convertible to ``int``.
    indicator_name : str
        Column to plot; also used as the y-axis label.

    A new 15x10 figure is created on every call.  Nothing is returned.
    """
    fig, ax = plt.subplots(figsize=[15, 10])
    for label, dfi in df.groupby(level=1):
        series = dfi[indicator_name]
        # Use the numeric year as the x coordinate so matplotlib places and
        # labels the ticks itself, instead of overwriting tick labels by hand.
        year_values = series.index.get_level_values(0).astype(int)
        ax.plot(year_values, series.values, label=label)
    ax.legend()
    ax.set_ylabel(indicator_name)
    ax.set_xlabel('year')
In [6]:
# Life expectancy: API indicator code when downloading, local export name otherwise.
if DOWNLOAD_DATA:
    indicator = 'SP.DYN.LE00.IN'
else:
    indicator = 'API_SP.DYN.LE00.IN_DS2_en_csv_v2_713010'
df1 = get_worldbank_dataframe(indicator)
df1.head()
Out[6]:
In [7]:
# Column label for this indicator; the scatter cells look the column up by
# this exact string.
indicator = 'Life expectency at birth'
# Long form, sorted by (year, country).
df1r = rearrange_dataframe(df1, indicator).sort_index(ascending=True)
# Keep every year, but only the selected countries.
df1c = df1r.loc[pd.IndexSlice[:, countries], :]
df1c.head()
Out[7]:
In [10]:
# Life expectancy over time for the selected countries.
plot_indicator(df=df1c, indicator_name=indicator)
In [11]:
# Total population: API indicator code when downloading, local export name otherwise.
if DOWNLOAD_DATA:
    indicator = 'SP.POP.TOTL'
else:
    indicator = 'API_SP.POP.TOTL_DS2_en_csv_v2_713131'
df2 = get_worldbank_dataframe(indicator)
df2.head()
Out[11]:
In [12]:
# Column label for this indicator; the scatter cells look the column up by
# this exact string.
indicator = 'Total population'
# Long form, sorted by (year, country).
df2r = rearrange_dataframe(df2, indicator).sort_index(ascending=True)
# Keep every year, but only the selected countries.
df2c = df2r.loc[pd.IndexSlice[:, countries], :]
df2c.head()
Out[12]:
In [14]:
# Total population over time for the selected countries.
plot_indicator(df=df2c, indicator_name=indicator)
Data from: http://api.worldbank.org/v2/en/indicator/NY.GDP.PCAP.CD?downloadformat=csv
In [15]:
# GDP per capita: API indicator code when downloading, local export name otherwise.
if DOWNLOAD_DATA:
    indicator = 'NY.GDP.PCAP.CD'
else:
    indicator = 'API_NY.GDP.PCAP.CD_DS2_en_csv_v2_713080'
df3 = get_worldbank_dataframe(indicator)
df3.head()
Out[15]:
In [16]:
# Column label for this indicator; the scatter cells look the column up by
# this exact string.
indicator = 'GDP per capita'
# Long form, sorted by (year, country).
df3r = rearrange_dataframe(df3, indicator).sort_index(ascending=True)
# Keep every year, but only the selected countries.
df3c = df3r.loc[pd.IndexSlice[:, countries], :]
df3c.head()
Out[16]:
In [17]:
# GDP per capita over time for the selected countries.
plot_indicator(df=df3c, indicator_name=indicator)
In [18]:
# Combine the three indicator frames on their shared (year, country) index,
# one merge at a time, so each row carries all three columns.
df4 = df1c
for other in (df2c, df3c):
    df4 = df4.merge(other, left_index=True, right_index=True)
df4.head()
Out[18]:
In [20]:
# Columns of df4 mapped to the x axis, the y axis and the marker size.
# These strings must match the labels used when the indicator frames were built.
x = 'GDP per capita'
y = 'Life expectency at birth'
s = 'Total population'
# Year shown in this static snapshot; used for both the lookup and the title.
snapshot_year = '1980'

fig, ax = plt.subplots(figsize=[15, 10])
for c in countries:
    g = df4.loc[(snapshot_year, c), :]
    # Marker size is the population scaled down by 1e7.
    ax.plot(g[x], g[y], marker='o', linestyle='', ms=g[s] / 1e7, label=c)

# Build the legend from proxy artists with a fixed marker size, so legend
# entries stay readable whatever the population-scaled size is. This uses only
# public matplotlib API rather than the private `_legmarker` attribute.
lgnd = ax.legend(handles=[
    Line2D([], [], marker='o', linestyle='', markersize=20,
           color=line.get_color(), label=line.get_label())
    for line in ax.get_lines()
])

ax.set_xlabel(x)
ax.set_ylabel(y)
ax.set_xlim([0, df4[x].max()])
ax.set_ylim([df4[y].min()*0.9, df4[y].max()*1.1])
ax.set_title(snapshot_year)
Out[20]:
In [21]:
def wb_scatter(df, year):
    """Add one year's bubbles (one per country) to the notebook-level ``ax``.

    Relies on notebook globals: ``ax`` (target axes), ``x`` / ``y`` / ``s``
    (column names for the x position, y position and marker size) and
    ``countries``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first index level holds the years and which can be
        addressed with ``df.loc[(year, country)]``.
    year : str
        Label of the year to draw; must be convertible to ``float``.

    The marker opacity grows linearly from 0 for the first year in the index
    to 0.5 for the last one.  Axis labels, limits and the title are reset on
    every call.  Nothing is returned.
    """
    current_palette = sns.color_palette()
    all_years = df.index.levels[0]
    first_year = float(all_years.min())
    rng = float(all_years.max()) - first_year
    cur = float(year) - first_year
    # With a single year the range is zero; draw at the maximum opacity
    # instead of dividing by zero.
    alpha = (cur / rng) / 2 if rng else 0.5
    for i, c in enumerate(countries):
        g = df.loc[(year, c), :]
        # Wrap around the palette so more countries than colours cannot fail.
        colour = current_palette[i % len(current_palette)]
        ax.plot(g[x], g[y], marker='o', linestyle='', color=colour,
                ms=g[s] / 1e7, label=c, alpha=alpha)
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.set_xlim([0, df[x].max()])
    ax.set_ylim([df[y].min()*0.9, df[y].max()*1.1])
    ax.set_title(year)
# Animated version: redraw the same figure once per year, accumulating bubbles.
fig, ax = plt.subplots(figsize=[15, 10])
do_legend = True
for year in df4.index.levels[0].values:
    wb_scatter(df4, year)
    if do_legend:
        # Create the legend once, from the first year's lines. Proxy artists
        # give fixed-size, fully opaque markers using only public matplotlib
        # API rather than the private `_legmarker` attribute.
        lgnd = ax.legend(handles=[
            Line2D([], [], marker='o', linestyle='', markersize=20, alpha=1.0,
                   color=line.get_color(), label=line.get_label())
            for line in ax.get_lines()
        ])
        do_legend = False
    # Replace the previously displayed frame with the updated figure.
    display.clear_output(wait=True)
    display.display(fig)
    sleep(0.01)
# Close the figure so the inline backend does not render the last frame a
# second time when the cell finishes.
plt.close(fig)
In [ ]:
In [ ]: