In [1]:
# This is my notebook for exploring data about economic inequality in Cambodia.
%matplotlib inline
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import qgrid
from pylab import *
import seaborn as sb
# Hey, good news! We can remotely access the World Bank's World Development Indicators Database
# directly from pandas!
from pandas.io import wb
In [2]:
# Search the World Development Indicators catalogue for indicators matching the
# pattern 'pov.*%' and keep only the first two columns of the result.
# qgrid is deliberately not used for this table: it did not render the id column
# correctly, although the output would look nicer if it did.
pov = wb.search('pov.*%').iloc[:, 0:2]
pov
Out[2]:
In [3]:
# Split the search result into two parallel lists: English labels and indicator ids.
# NOTE(review): 7529 and 7550 are hard-coded row labels taken from one particular
# search result (.loc slices by label, both ends included) -- confirm they still
# select the intended rows whenever the catalogue changes.
povnames = pov.loc[7529:7550]['name'].tolist()
# From this point on `pov` is a plain list of ids instead of a DataFrame.
pov = pov.loc[7529:7550]['id'].tolist()
# Show the labels
povnames
Out[3]:
In [4]:
# Display the list of poverty indicator ids (by now `pov` is a list, not a DataFrame)
pov
Out[4]:
In [5]:
# Lookup table from poverty indicator id to its English label
povdict = {indicator_id: label for indicator_id, label in zip(pov, povnames)}
povdict
Out[5]:
In [6]:
# Same kind of catalogue search, this time for income-share indicators
# (pattern 'income.*share.*%'); again only the first two columns are kept.
inc = wb.search('income.*share.*%').iloc[:, 0:2]
inc
Out[6]:
In [7]:
# As with the poverty indicators, split the table into a list of labels and a list
# of ids. Every row of the income search result is kept.
incnames = inc['name'].tolist()
# `inc` is rebound from a DataFrame to a plain list of ids.
inc = inc['id'].tolist()
incnames
Out[7]:
In [8]:
# Display the list of income indicator ids (by now `inc` is a list, not a DataFrame)
inc
Out[8]:
In [9]:
# Lookup table from income indicator id to its English label
incdict = {indicator_id: label for indicator_id, label in zip(inc, incnames)}
incdict
Out[9]:
In [10]:
# Master list of every indicator id to download: poverty ids first, then income ids
idx = list(pov)
idx.extend(inc)
idx
Out[10]:
In [11]:
# Fetch every selected indicator for Cambodia (KHM), 2004 through 2012, as a DataFrame
khm = wb.download(
    indicator=idx,
    country='KHM',
    start=2004,
    end=2012
)
khm
Out[11]:
In [12]:
# Tidy the download in three steps:
#   1. drop the outermost index level (the Cambodia level),
#   2. reverse the row order so that the years are ascending,
#   3. discard every column that contains nothing but NA values.
khm.index = khm.index.droplevel(0)
khm = khm.iloc[::-1].dropna(axis=1, how='all')
qgrid.show_grid(khm, remote_js=True)
In [13]:
# Put the share of total income earned by the highest 10% next to that of the
# lowest 10%. There is no data for 2012, so only the first eight rows are kept.
incframe = khm[['SI.DST.10TH.10', 'SI.DST.FRST.10']].iloc[0:8]
# Replace the indicator ids with their English labels from the income dictionary
incframe.columns = [incdict[indicator_id] for indicator_id in incframe.columns]
incframe
Out[13]:
In [14]:
# Use short legend labels, then plot both income-share series over time.
incframe.columns = ['Highest 10% of Income Earners', 'Lowest 10% of Income Earners']
# DataFrame.plot creates its own figure and returns the Axes it drew on, so no
# separate plt.figure() call is made (that would only leave an empty extra figure
# behind). Labels are set on the returned Axes rather than through pyplot's
# implicit "current axes".
ax = incframe.plot(title='Income Share in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
Out[14]:
In [15]:
# Income share held by each 20% band of the population, from lowest to highest.
incframe1 = khm[['SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20', 'SI.DST.04TH.20',
                 'SI.DST.05TH.20']]
# Look up each indicator id in the income dictionary to get the new column names.
# The names are built as a list: assigning into a range object element by element
# raises TypeError on Python 3.
newcolumns = [incdict[indicator_id] for indicator_id in incframe1.columns]
incframe1.columns = newcolumns
incframe1 = incframe1[0:8]  # Omit 2012, no data
incframe1
Out[15]:
In [16]:
# Switch to short column names for the legend, then draw the line plot.
incframe1.columns = ['Lowest 20%', 'Second Lowest 20%', 'Middle 20%', 'Second Highest 20%', 'Highest 20%']
# DataFrame.plot opens its own figure and returns the Axes; a preceding
# plt.figure() would only produce an empty extra figure, so it is omitted.
ax = incframe1.plot(title='Income Share in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
# Fix the y-axis range to 0-50 (percent)
ax.set_ylim(0, 50)
Out[16]:
In [17]:
# Stacked bar graph of the same 20%-band income shares.
# `stacked` is passed as the boolean True; the string 'True' only worked because
# any non-empty string is truthy (so 'False' would have stacked the bars as well).
# DataFrame.plot opens its own figure, so no separate plt.figure() call is needed.
ax = incframe1.plot(title='Income Share in Cambodia', kind='bar', stacked=True)
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
Out[17]:
In [18]:
# For comparison with the US: download the same income-share indicators for the
# USA, 2004 through 2011.
usa = wb.download(
    indicator=[
        'SI.DST.10TH.10',
        'SI.DST.FRST.10',
        'SI.DST.FRST.20',
        'SI.DST.02ND.20',
        'SI.DST.03RD.20',
        'SI.DST.04TH.20',
        'SI.DST.05TH.20',
    ],
    country='USA',
    start=2004,
    end=2011
)
# Drop the outermost index level, then reverse the row order
usa.index = usa.index.droplevel(0)
usa = usa.iloc[::-1]
qgrid.show_grid(usa, remote_js=True)
In [19]:
# There is not nearly as much data for the US during this time period, but it is
# still worth looking at: highest 10% versus lowest 10% of income earners.
usainc = usa[['SI.DST.10TH.10', 'SI.DST.FRST.10']]
usainc.columns = ['Highest 10% of Income Earners', 'Lowest 10% of Income Earners']
# DataFrame.plot opens its own figure and returns the Axes; a preceding
# plt.figure() would only leave an empty extra figure, so it is omitted.
ax = usainc.plot(title='Income Share in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
Out[19]:
In [20]:
# US income share held by each 20% band of the population, from lowest to highest.
usainc1 = usa[['SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20', 'SI.DST.04TH.20', 'SI.DST.05TH.20']]
usainc1.columns = ['Lowest 20%', 'Second Lowest 20%', 'Middle 20%', 'Second Highest 20%', 'Highest 20%']
# DataFrame.plot opens its own figure and returns the Axes; a preceding
# plt.figure() would only leave an empty extra figure, so it is omitted.
ax = usainc1.plot(title='Income Share in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
Out[20]:
In [21]:
# Stacked bar graph of the US 20%-band income shares.
# `stacked` is passed as the boolean True; the string 'True' only worked because
# any non-empty string is truthy. DataFrame.plot opens its own figure, so no
# separate plt.figure() call is needed.
ax = usainc1.plot(title='Income Share in the USA', kind='bar', stacked=True)
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
Out[21]:
In [27]:
# The US distribution is not much better (perhaps worse). To get a better look at
# the differences, fetch the Gross National Income per capita, starting with
# Cambodia (2004 through 2011).
khmgni = wb.download(
    indicator=['NY.GNP.PCAP.CD'],
    country='KHM',
    start=2004,
    end=2011
)
# Drop the outermost index level, then reverse the row order
khmgni.index = khmgni.index.droplevel(0)
khmgni = khmgni.iloc[::-1]
qgrid.show_grid(khmgni, remote_js=True)
In [28]:
# Plot Cambodia's GNI per capita over time.
# DataFrame.plot opens its own figure and returns the Axes; a preceding
# plt.figure() would only leave an empty extra figure, so it is omitted.
ax = khmgni.plot(title='Gross National Income per Capita in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Income ($)')
Out[28]:
In [29]:
# Gross National Income per capita for the USA, 2004 through 2011
usagni = wb.download(
    indicator=['NY.GNP.PCAP.CD'],
    country='USA',
    start=2004,
    end=2011
)
# Drop the outermost index level, then reverse the row order
usagni.index = usagni.index.droplevel(0)
usagni = usagni.iloc[::-1]
qgrid.show_grid(usagni, remote_js=True)
In [30]:
# Plot the USA's GNI per capita over time.
# DataFrame.plot opens its own figure and returns the Axes; a preceding
# plt.figure() would only leave an empty extra figure, so it is omitted.
ax = usagni.plot(title='Gross National Income per Capita in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Income ($)')
Out[30]:
In [24]:
# Explore how close people are to the poverty boundary and how it is changing over time
povline = khm[['SI.POV.25DAY', 'SI.POV.2DAY', 'SI.POV.4DAY', 'SI.POV.5DAY', 'SI.POV.DDAY', 'SI.POV.NAHC']]
# Look up each indicator id in the poverty dictionary to get the new column names.
# The names are built as a list: assigning into a range object element by element
# raises TypeError on Python 3.
newcolumns = [povdict[indicator_id] for indicator_id in povline.columns]
povline.columns = newcolumns
povline
Out[24]:
In [31]:
# Change the column titles to short legend labels, then plot.
povline.columns = ['<$2.50 a day', '<$2 a day', '<$4 a day', '<$5 a day', '<$1.25 a day', '<National poverty lines']
# DataFrame.plot opens its own figure and returns the Axes; a preceding
# plt.figure() would only leave an empty extra figure, so it is omitted.
ax = povline.plot(title='Poverty Headcount Ratio at Different Incomes in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('% of population')
Out[31]:
In [26]:
# The World Bank does not have most of this data for the USA, so I will find US Census data later.